/*
 * Copyright (C) 2013-2023 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 * Copyright (C) 2007-2013 Sourcefire, Inc.
 *
 * Authors: Tomasz Kojm
 *
 * Acknowledgements: The header structures were based upon "ELF: Executable
 *                   and Linkable Format, Portable Formats Specification,
 *                   Version 1.1".
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */

#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <time.h>

#include "elf.h"
#include "clamav.h"
#include "execs.h"
#include "matcher.h"
#include "scanners.h"

#define EC16(v, conv) (conv ? cbswap16(v) : v)
#define EC32(v, conv) (conv ? cbswap32(v) : v)
#define EC64(v, conv) (conv ? cbswap64(v) : v)
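
/*
 * EC16/EC32/EC64 byte-swap a header field only when `conv` is non-zero,
 * i.e. when the file's byte order differs from the host's. For example,
 * on a little-endian host parsing a big-endian ELF, conv is 1 and
 * EC32(v, conv) evaluates to cbswap32(v); otherwise v is returned as-is.
 */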

#define CLI_TMPUNLK()                   \
    if (!ctx->engine->keeptmp) {        \
        if (cli_unlink(tempfile)) {     \
            free(tempfile);             \
            return CL_EUNLINK;          \
        }                               \
    }
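
/*
 * CLI_TMPUNLK() is a helper for scan routines that write temporary files:
 * unless the engine's keeptmp option is set, it unlinks `tempfile`, and if
 * the unlink fails it frees the buffer and bails out with CL_EUNLINK. It
 * expects `ctx` and `tempfile` to exist in the caller's scope; no function
 * in this excerpt uses it.
 */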

static void cli_elf_sectionlog(uint32_t sh_type, uint32_t sh_flags);

static uint32_t cli_rawaddr32(uint32_t vaddr, struct elf_program_hdr32 *ph, uint16_t phnum, uint8_t conv, uint8_t *err)
{
    uint16_t i, found = 0;

    for (i = 0; i < phnum; i++) {
        if (EC32(ph[i].p_vaddr, conv) <= vaddr && EC32(ph[i].p_vaddr, conv) + EC32(ph[i].p_memsz, conv) > vaddr) {
            found = 1;
            break;
        }
    }

    if (!found) {
        *err = 1;
        return 0;
    }

    *err = 0;
    return vaddr - EC32(ph[i].p_vaddr, conv) + EC32(ph[i].p_offset, conv);
}
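
/*
 * cli_rawaddr32() above and cli_rawaddr64() below translate a virtual
 * address into a raw file offset: find the program segment whose
 * [p_vaddr, p_vaddr + p_memsz) range contains the address, then apply that
 * segment's delta, raw = vaddr - p_vaddr + p_offset. For example
 * (hypothetical values), with a segment at p_vaddr 0x08048000 and
 * p_offset 0, an entry point of 0x08048074 maps to raw offset 0x74.
 * If no segment contains the address, *err is set and 0 is returned.
 */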

static uint64_t cli_rawaddr64(uint64_t vaddr, struct elf_program_hdr64 *ph, uint16_t phnum, uint8_t conv, uint8_t *err)
{
    uint16_t i, found = 0;

    for (i = 0; i < phnum; i++) {
        if (EC64(ph[i].p_vaddr, conv) <= vaddr && EC64(ph[i].p_vaddr, conv) + EC64(ph[i].p_memsz, conv) > vaddr) {
            found = 1;
            break;
        }
    }

    if (!found) {
        *err = 1;
        return 0;
    }

    *err = 0;
    return vaddr - EC64(ph[i].p_vaddr, conv) + EC64(ph[i].p_offset, conv);
}

/* Return converted endian-fixed header, or error code */
static cl_error_t cli_elf_fileheader(cli_ctx *ctx, fmap_t *map, union elf_file_hdr *file_hdr,
                                     uint8_t *do_convert, uint8_t *is64)
{
    uint8_t format64, conv;

    /* Load enough for smaller header first */
    if (fmap_readn(map, file_hdr, 0, sizeof(struct elf_file_hdr32)) != sizeof(struct elf_file_hdr32)) {
        /* Not an ELF file? */
        cli_dbgmsg("ELF: Can't read file header\n");
        return CL_BREAK;
    }

    if (memcmp(file_hdr->hdr64.e_ident, "\x7f\x45\x4c\x46", 4)) {
        cli_dbgmsg("ELF: Not an ELF file\n");
        return CL_BREAK;
    }

    switch (file_hdr->hdr64.e_ident[4]) {
        case 1:
            cli_dbgmsg("ELF: ELF class 1 (32-bit)\n");
            format64 = 0;
            break;
        case 2:
            cli_dbgmsg("ELF: ELF class 2 (64-bit)\n");
            format64 = 1;
            break;
        default:
            cli_dbgmsg("ELF: Unknown ELF class (%u)\n", file_hdr->hdr64.e_ident[4]);
            if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
                return CL_VIRUS;
            }
            return CL_BREAK;
    }

    /* Need to know to endian convert */
    if (file_hdr->hdr64.e_ident[5] == 1) {
#if WORDS_BIGENDIAN == 0
        if (ctx)
            cli_dbgmsg("ELF: File is little-endian - conversion not required\n");
        conv = 0;
#else
        if (ctx)
            cli_dbgmsg("ELF: File is little-endian - data conversion enabled\n");
        conv = 1;
#endif
    } else {
#if WORDS_BIGENDIAN == 0
        if (ctx)
            cli_dbgmsg("ELF: File is big-endian - data conversion enabled\n");
        conv = 1;
#else
        if (ctx)
            cli_dbgmsg("ELF: File is big-endian - conversion not required\n");
        conv = 0;
#endif
    }

    *do_convert = conv;
    *is64       = format64;

    /* Solve bit-size and conversion pronto */
    file_hdr->hdr64.e_type    = EC16(file_hdr->hdr64.e_type, conv);
    file_hdr->hdr64.e_machine = EC16(file_hdr->hdr64.e_machine, conv);
    file_hdr->hdr64.e_version = EC32(file_hdr->hdr64.e_version, conv);

    if (format64) {
        /* Read rest of 64-bit header */
        if (fmap_readn(map, file_hdr->hdr32.pad, sizeof(struct elf_file_hdr32), ELF_HDR_SIZEDIFF) != ELF_HDR_SIZEDIFF) {
            /* Not an ELF file? */
            cli_dbgmsg("ELF: Can't read file header\n");
            return CL_BREAK;
        }
        /* Now endian convert, if needed */
        if (conv) {
            file_hdr->hdr64.e_entry     = EC64(file_hdr->hdr64.e_entry, conv);
            file_hdr->hdr64.e_phoff     = EC64(file_hdr->hdr64.e_phoff, conv);
            file_hdr->hdr64.e_shoff     = EC64(file_hdr->hdr64.e_shoff, conv);
            file_hdr->hdr64.e_flags     = EC32(file_hdr->hdr64.e_flags, conv);
            file_hdr->hdr64.e_ehsize    = EC16(file_hdr->hdr64.e_ehsize, conv);
            file_hdr->hdr64.e_phentsize = EC16(file_hdr->hdr64.e_phentsize, conv);
            file_hdr->hdr64.e_phnum     = EC16(file_hdr->hdr64.e_phnum, conv);
            file_hdr->hdr64.e_shentsize = EC16(file_hdr->hdr64.e_shentsize, conv);
            file_hdr->hdr64.e_shnum     = EC16(file_hdr->hdr64.e_shnum, conv);
            file_hdr->hdr64.e_shstrndx  = EC16(file_hdr->hdr64.e_shstrndx, conv);
        }
    } else {
        /* Convert 32-bit structure, if needed */
        if (conv) {
            file_hdr->hdr32.hdr.e_entry     = EC32(file_hdr->hdr32.hdr.e_entry, conv);
            file_hdr->hdr32.hdr.e_phoff     = EC32(file_hdr->hdr32.hdr.e_phoff, conv);
            file_hdr->hdr32.hdr.e_shoff     = EC32(file_hdr->hdr32.hdr.e_shoff, conv);
            file_hdr->hdr32.hdr.e_flags     = EC32(file_hdr->hdr32.hdr.e_flags, conv);
            file_hdr->hdr32.hdr.e_ehsize    = EC16(file_hdr->hdr32.hdr.e_ehsize, conv);
            file_hdr->hdr32.hdr.e_phentsize = EC16(file_hdr->hdr32.hdr.e_phentsize, conv);
            file_hdr->hdr32.hdr.e_phnum     = EC16(file_hdr->hdr32.hdr.e_phnum, conv);
            file_hdr->hdr32.hdr.e_shentsize = EC16(file_hdr->hdr32.hdr.e_shentsize, conv);
            file_hdr->hdr32.hdr.e_shnum     = EC16(file_hdr->hdr32.hdr.e_shnum, conv);
            file_hdr->hdr32.hdr.e_shstrndx  = EC16(file_hdr->hdr32.hdr.e_shstrndx, conv);
        }
        /* Wipe pad for safety */
        memset(file_hdr->hdr32.pad, 0, ELF_HDR_SIZEDIFF);
    }

    return CL_CLEAN;
}
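
/*
 * Note on the union read above: elf_file_hdr overlays the 32-bit header
 * (plus an ELF_HDR_SIZEDIFF pad) with the 64-bit header, so the parser can
 * read sizeof(struct elf_file_hdr32) bytes first, inspect e_ident[4] for
 * the class, and then fetch only the remaining ELF_HDR_SIZEDIFF bytes into
 * hdr32.pad when the file turns out to be 64-bit.
 */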

/* Read 32-bit program headers */
static cl_error_t cli_elf_ph32(cli_ctx *ctx, fmap_t *map, struct cli_exe_info *elfinfo,
                               struct elf_file_hdr32 *file_hdr, uint8_t conv)
{
    struct elf_program_hdr32 *program_hdr = NULL;
    uint16_t phnum, phentsize;
    uint32_t entry, fentry = 0, phoff;
    uint32_t i;
    uint8_t err;

    /* Program headers and Entry */
    phnum = file_hdr->e_phnum;
    cli_dbgmsg("ELF: Number of program headers: %d\n", phnum);
    if (phnum > 128) {
        cli_dbgmsg("ELF: Suspicious number of program headers\n");
        if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
            return CL_VIRUS;
        }
        return CL_EFORMAT;
    }
    entry = file_hdr->e_entry;

    if (phnum && entry) {
        phentsize = file_hdr->e_phentsize;
        /* Sanity check */
        if (phentsize != sizeof(struct elf_program_hdr32)) {
            cli_dbgmsg("ELF: phentsize != sizeof(struct elf_program_hdr32)\n");
            if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
                return CL_VIRUS;
            }
            return CL_EFORMAT;
        }

        phoff = file_hdr->e_phoff;
        if (ctx) {
            cli_dbgmsg("ELF: Program header table offset: %u\n", phoff);
        }

        if (phnum) {
            program_hdr = (struct elf_program_hdr32 *)cli_calloc(phnum, sizeof(struct elf_program_hdr32));
            if (!program_hdr) {
                cli_errmsg("ELF: Can't allocate memory for program headers\n");
                return CL_EMEM;
            }
            if (ctx) {
                cli_dbgmsg("------------------------------------\n");
            }
        }

        for (i = 0; i < phnum; i++) {
            err = 0;
            if (fmap_readn(map, &program_hdr[i], phoff, sizeof(struct elf_program_hdr32)) != sizeof(struct elf_program_hdr32))
                err = 1;
            phoff += sizeof(struct elf_program_hdr32);

            if (err) {
                cli_dbgmsg("ELF: Can't read segment #%d\n", i);
                if (ctx) {
                    cli_dbgmsg("ELF: Possibly broken ELF file\n");
                }
                free(program_hdr);
                if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
                    return CL_VIRUS;
                }
                return CL_BREAK;
            }

            if (ctx) {
                cli_dbgmsg("ELF: Segment #%d\n", i);
                cli_dbgmsg("ELF: Segment type: 0x%x\n", EC32(program_hdr[i].p_type, conv));
                cli_dbgmsg("ELF: Segment offset: 0x%x\n", EC32(program_hdr[i].p_offset, conv));
                cli_dbgmsg("ELF: Segment virtual address: 0x%x\n", EC32(program_hdr[i].p_vaddr, conv));
                cli_dbgmsg("ELF: Segment real size: 0x%x\n", EC32(program_hdr[i].p_filesz, conv));
                cli_dbgmsg("ELF: Segment virtual size: 0x%x\n", EC32(program_hdr[i].p_memsz, conv));
                cli_dbgmsg("------------------------------------\n");
            }
        }

        fentry = cli_rawaddr32(entry, program_hdr, phnum, conv, &err);
        free(program_hdr);
        if (err) {
            cli_dbgmsg("ELF: Can't calculate file offset of entry point\n");
            if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
                return CL_VIRUS;
            }
            return CL_EFORMAT;
        }
        if (ctx) {
            cli_dbgmsg("ELF: Entry point address: 0x%.8x\n", entry);
            cli_dbgmsg("ELF: Entry point offset: 0x%.8x (%d)\n", fentry, fentry);
        }
    }

    if (elfinfo) {
        elfinfo->ep = fentry;
    }

    return CL_CLEAN;
}
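
/*
 * Return-value convention shared by the program/section header parsers:
 * CL_CLEAN on success, CL_BREAK for "stop parsing but treat the file as
 * clean" (cli_scanelf() maps it back to CL_CLEAN), CL_VIRUS when the
 * Heuristics.Broken.Executable alert fires, and CL_EFORMAT/CL_EMEM for
 * malformed input or allocation failure.
 */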

/* Read 64-bit program headers */
static cl_error_t cli_elf_ph64(cli_ctx *ctx, fmap_t *map, struct cli_exe_info *elfinfo,
                               struct elf_file_hdr64 *file_hdr, uint8_t conv)
{
    struct elf_program_hdr64 *program_hdr = NULL;
    uint16_t phnum, phentsize;
    uint64_t entry, fentry = 0, phoff;
    uint32_t i;
    uint8_t err;

    /* Program headers and Entry */
    phnum = file_hdr->e_phnum;
    cli_dbgmsg("ELF: Number of program headers: %d\n", phnum);
    if (phnum > 128) {
        cli_dbgmsg("ELF: Suspicious number of program headers\n");
        if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
            return CL_VIRUS;
        }
        return CL_EFORMAT;
    }
    entry = file_hdr->e_entry;

    if (phnum && entry) {
        phentsize = file_hdr->e_phentsize;
        /* Sanity check */
        if (phentsize != sizeof(struct elf_program_hdr64)) {
            cli_dbgmsg("ELF: phentsize != sizeof(struct elf_program_hdr64)\n");
            if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
                return CL_VIRUS;
            }
            return CL_EFORMAT;
        }

        phoff = file_hdr->e_phoff;
        if (ctx) {
            cli_dbgmsg("ELF: Program header table offset: " STDu64 "\n", phoff);
        }

        if (phnum) {
            program_hdr = (struct elf_program_hdr64 *)cli_calloc(phnum, sizeof(struct elf_program_hdr64));
            if (!program_hdr) {
                cli_errmsg("ELF: Can't allocate memory for program headers\n");
                return CL_EMEM;
            }
            if (ctx) {
                cli_dbgmsg("------------------------------------\n");
            }
        }

        for (i = 0; i < phnum; i++) {
            err = 0;
            if (fmap_readn(map, &program_hdr[i], phoff, sizeof(struct elf_program_hdr64)) != sizeof(struct elf_program_hdr64))
                err = 1;
            phoff += sizeof(struct elf_program_hdr64);

            if (err) {
                cli_dbgmsg("ELF: Can't read segment #%d\n", i);
                if (ctx) {
                    cli_dbgmsg("ELF: Possibly broken ELF file\n");
                }
                free(program_hdr);
                if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
                    return CL_VIRUS;
                }
                return CL_BREAK;
            }

            if (ctx) {
                cli_dbgmsg("ELF: Segment #%d\n", i);
                cli_dbgmsg("ELF: Segment type: 0x" STDx32 "\n", (uint32_t)EC32(program_hdr[i].p_type, conv));
                cli_dbgmsg("ELF: Segment offset: 0x" STDx64 "\n", (uint64_t)EC64(program_hdr[i].p_offset, conv));
                cli_dbgmsg("ELF: Segment virtual address: 0x" STDx64 "\n", (uint64_t)EC64(program_hdr[i].p_vaddr, conv));
                cli_dbgmsg("ELF: Segment real size: 0x" STDx64 "\n", (uint64_t)EC64(program_hdr[i].p_filesz, conv));
                cli_dbgmsg("ELF: Segment virtual size: 0x" STDx64 "\n", (uint64_t)EC64(program_hdr[i].p_memsz, conv));
                cli_dbgmsg("------------------------------------\n");
            }
        }

        fentry = cli_rawaddr64(entry, program_hdr, phnum, conv, &err);
        free(program_hdr);
        if (err) {
            cli_dbgmsg("ELF: Can't calculate file offset of entry point\n");
            if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
                return CL_VIRUS;
            }
            return CL_EFORMAT;
        }
        if (ctx) {
            cli_dbgmsg("ELF: Entry point address: 0x%.16" PRIx64 "\n", entry);
            cli_dbgmsg("ELF: Entry point offset: 0x%.16" PRIx64 " (" STDi64 ")\n", fentry, fentry);
        }
    }

    if (elfinfo) {
        elfinfo->ep = fentry;
    }

    return CL_CLEAN;
}
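
/*
 * STDu64/STDx64/STDi64 (and the 32-bit variants) are ClamAV's portable
 * printf format-specifier macros, used like C99's PRIu64/PRIx64, so the
 * 64-bit debug output above compiles cleanly across platforms.
 */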

/* 32-bit version of section header parsing */
static cl_error_t cli_elf_sh32(cli_ctx *ctx, fmap_t *map, struct cli_exe_info *elfinfo,
                               struct elf_file_hdr32 *file_hdr, uint8_t conv)
{
    struct elf_section_hdr32 *section_hdr = NULL;
    uint16_t shnum, shentsize;
    uint32_t shoff, i;

    shnum = file_hdr->e_shnum;
    cli_dbgmsg("ELF: Number of sections: %d\n", shnum);
    if (ctx && (shnum > 2048)) {
        cli_dbgmsg("ELF: Number of sections > 2048, skipping\n");
        return CL_BREAK;
    } else if (elfinfo && (shnum > 256)) {
        cli_dbgmsg("ELF: Suspicious number of sections\n");
        return CL_BREAK;
    }
    if (elfinfo) {
        elfinfo->nsections = shnum;
    }

    shentsize = file_hdr->e_shentsize;
    /* Sanity check */
    if (shentsize != sizeof(struct elf_section_hdr32)) {
        cli_dbgmsg("ELF: shentsize != sizeof(struct elf_section_hdr32)\n");
        if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
            return CL_VIRUS;
        }
        return CL_EFORMAT;
    }

    if (elfinfo && !shnum) {
        return CL_CLEAN;
    }

    shoff = file_hdr->e_shoff;
    if (ctx)
        cli_dbgmsg("ELF: Section header table offset: %u\n", shoff);

    if (elfinfo) {
        elfinfo->sections = (struct cli_exe_section *)cli_calloc(shnum, sizeof(struct cli_exe_section));
        if (!elfinfo->sections) {
            cli_dbgmsg("ELF: Can't allocate memory for section headers\n");
            return CL_EMEM;
        }
    }

    if (shnum) {
        section_hdr = (struct elf_section_hdr32 *)cli_calloc(shnum, shentsize);
        if (!section_hdr) {
            cli_errmsg("ELF: Can't allocate memory for section headers\n");
            return CL_EMEM;
        }
        if (ctx) {
            cli_dbgmsg("------------------------------------\n");
        }
    }

    /* Loop over section headers */
    for (i = 0; i < shnum; i++) {
        uint32_t sh_type, sh_flags;

        if (fmap_readn(map, &section_hdr[i], shoff, sizeof(struct elf_section_hdr32)) != sizeof(struct elf_section_hdr32)) {
            cli_dbgmsg("ELF: Can't read section header\n");
            if (ctx) {
                cli_dbgmsg("ELF: Possibly broken ELF file\n");
            }
            free(section_hdr);
            if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
                return CL_VIRUS;
            }
            return CL_BREAK;
        }

        shoff += sizeof(struct elf_section_hdr32);

        if (elfinfo) {
            elfinfo->sections[i].rva = EC32(section_hdr[i].sh_addr, conv);
            elfinfo->sections[i].raw = EC32(section_hdr[i].sh_offset, conv);
            elfinfo->sections[i].rsz = EC32(section_hdr[i].sh_size, conv);
        }
        if (ctx) {
            cli_dbgmsg("ELF: Section %u\n", i);
            cli_dbgmsg("ELF: Section offset: %u\n", EC32(section_hdr[i].sh_offset, conv));
            cli_dbgmsg("ELF: Section size: %u\n", EC32(section_hdr[i].sh_size, conv));

            sh_type  = EC32(section_hdr[i].sh_type, conv);
            sh_flags = EC32(section_hdr[i].sh_flags, conv) & ELF_SHF_MASK;
            cli_elf_sectionlog(sh_type, sh_flags);

            cli_dbgmsg("------------------------------------\n");
        }
    }

    free(section_hdr);
    return CL_CLEAN;
}
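
/*
 * Note the dual-use pattern in the sh32/sh64 parsers: cli_scanelf() below
 * calls them with a ctx and a NULL elfinfo (debug logging only), while a
 * metadata-collecting caller (not shown in this excerpt) presumably passes
 * elfinfo so that section RVAs, raw offsets, and sizes get recorded in
 * elfinfo->sections for signature matching.
 */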

/* 64-bit version of section header parsing */
static cl_error_t cli_elf_sh64(cli_ctx *ctx, fmap_t *map, struct cli_exe_info *elfinfo,
                               struct elf_file_hdr64 *file_hdr, uint8_t conv)
{
    struct elf_section_hdr64 *section_hdr = NULL;
    uint16_t shnum, shentsize;
    uint32_t i;
    uint64_t shoff;

    shnum = file_hdr->e_shnum;
    cli_dbgmsg("ELF: Number of sections: %d\n", shnum);
    if (ctx && (shnum > 2048)) {
        cli_dbgmsg("ELF: Number of sections > 2048, skipping\n");
        return CL_BREAK;
    } else if (elfinfo && (shnum > 256)) {
        cli_dbgmsg("ELF: Suspicious number of sections\n");
        return CL_BREAK;
    }
    if (elfinfo) {
        elfinfo->nsections = shnum;
    }

    shentsize = file_hdr->e_shentsize;
    /* Sanity check */
    if (shentsize != sizeof(struct elf_section_hdr64)) {
        cli_dbgmsg("ELF: shentsize != sizeof(struct elf_section_hdr64)\n");
        if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
            return CL_VIRUS;
        }
        return CL_EFORMAT;
    }

    if (elfinfo && !shnum) {
        return CL_CLEAN;
    }

    shoff = file_hdr->e_shoff;
    if (ctx)
        cli_dbgmsg("ELF: Section header table offset: " STDu64 "\n", shoff);

    if (elfinfo) {
        elfinfo->sections = (struct cli_exe_section *)cli_calloc(shnum, sizeof(struct cli_exe_section));
        if (!elfinfo->sections) {
            cli_dbgmsg("ELF: Can't allocate memory for section headers\n");
            return CL_EMEM;
        }
    }

    if (shnum) {
        section_hdr = (struct elf_section_hdr64 *)cli_calloc(shnum, shentsize);
        if (!section_hdr) {
            cli_errmsg("ELF: Can't allocate memory for section headers\n");
            return CL_EMEM;
        }
        if (ctx) {
            cli_dbgmsg("------------------------------------\n");
        }
    }

    /* Loop over section headers */
    for (i = 0; i < shnum; i++) {
        uint32_t sh_type, sh_flags;

        if (fmap_readn(map, &section_hdr[i], shoff, sizeof(struct elf_section_hdr64)) != sizeof(struct elf_section_hdr64)) {
            cli_dbgmsg("ELF: Can't read section header\n");
            if (ctx) {
                cli_dbgmsg("ELF: Possibly broken ELF file\n");
            }
            free(section_hdr);
            if (ctx && SCAN_HEURISTIC_BROKEN && (CL_VIRUS == cli_append_potentially_unwanted(ctx, "Heuristics.Broken.Executable"))) {
                return CL_VIRUS;
            }
            return CL_BREAK;
        }

        shoff += sizeof(struct elf_section_hdr64);

        if (elfinfo) {
            elfinfo->sections[i].rva = EC64(section_hdr[i].sh_addr, conv);
            elfinfo->sections[i].raw = EC64(section_hdr[i].sh_offset, conv);
            elfinfo->sections[i].rsz = EC64(section_hdr[i].sh_size, conv);
        }
        if (ctx) {
            cli_dbgmsg("ELF: Section " STDu32 "\n", (uint32_t)i);
            cli_dbgmsg("ELF: Section offset: " STDu64 "\n", (uint64_t)EC64(section_hdr[i].sh_offset, conv));
            cli_dbgmsg("ELF: Section size: " STDu64 "\n", (uint64_t)EC64(section_hdr[i].sh_size, conv));

            sh_type  = EC32(section_hdr[i].sh_type, conv);
            sh_flags = (uint32_t)(EC64(section_hdr[i].sh_flags, conv) & ELF_SHF_MASK);
            cli_elf_sectionlog(sh_type, sh_flags);

            cli_dbgmsg("------------------------------------\n");
        }
    }

    free(section_hdr);
    return CL_CLEAN;
}

/* Print section type and selected flags to the log */
static void cli_elf_sectionlog(uint32_t sh_type, uint32_t sh_flags)
{
    switch (sh_type) {
        case 0x6: /* SHT_DYNAMIC */
            cli_dbgmsg("ELF: Section type: Dynamic linking information\n");
            break;
        case 0xb: /* SHT_DYNSYM */
            cli_dbgmsg("ELF: Section type: Symbols for dynamic linking\n");
            break;
        case 0xf: /* SHT_FINI_ARRAY */
            cli_dbgmsg("ELF: Section type: Array of pointers to termination functions\n");
            break;
        case 0x5: /* SHT_HASH */
            cli_dbgmsg("ELF: Section type: Symbol hash table\n");
            break;
        case 0xe: /* SHT_INIT_ARRAY */
            cli_dbgmsg("ELF: Section type: Array of pointers to initialization functions\n");
            break;
        case 0x8: /* SHT_NOBITS */
            cli_dbgmsg("ELF: Section type: Empty section (NOBITS)\n");
            break;
        case 0x7: /* SHT_NOTE */
            cli_dbgmsg("ELF: Section type: Note section\n");
            break;
        case 0x0: /* SHT_NULL */
            cli_dbgmsg("ELF: Section type: Null (no associated section)\n");
            break;
        case 0x10: /* SHT_PREINIT_ARRAY */
            cli_dbgmsg("ELF: Section type: Array of pointers to preinit functions\n");
            break;
        case 0x1: /* SHT_PROGBITS */
            cli_dbgmsg("ELF: Section type: Program information\n");
            break;
        case 0x9: /* SHT_REL */
            cli_dbgmsg("ELF: Section type: Relocation entries w/o explicit addends\n");
            break;
        case 0x4: /* SHT_RELA */
            cli_dbgmsg("ELF: Section type: Relocation entries with explicit addends\n");
            break;
        case 0x3: /* SHT_STRTAB */
            cli_dbgmsg("ELF: Section type: String table\n");
            break;
        case 0x2: /* SHT_SYMTAB */
            cli_dbgmsg("ELF: Section type: Symbol table\n");
            break;
        case 0x6ffffffd: /* SHT_GNU_verdef */
            cli_dbgmsg("ELF: Section type: Provided symbol versions\n");
            break;
        case 0x6ffffffe: /* SHT_GNU_verneed */
            cli_dbgmsg("ELF: Section type: Required symbol versions\n");
            break;
        case 0x6fffffff: /* SHT_GNU_versym */
            cli_dbgmsg("ELF: Section type: Symbol Version Table\n");
            break;
        default:
            cli_dbgmsg("ELF: Section type: Unknown\n");
    }

    if (sh_flags & ELF_SHF_WRITE)
        cli_dbgmsg("ELF: Section contains writable data\n");

    if (sh_flags & ELF_SHF_ALLOC)
        cli_dbgmsg("ELF: Section occupies memory\n");

    if (sh_flags & ELF_SHF_EXECINSTR)
        cli_dbgmsg("ELF: Section contains executable code\n");
}
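
/*
 * ELF_SHF_WRITE, ELF_SHF_ALLOC, and ELF_SHF_EXECINSTR mirror the standard
 * SHF_WRITE (0x1), SHF_ALLOC (0x2), and SHF_EXECINSTR (0x4) section flags;
 * ELF_SHF_MASK presumably keeps just the bits this logger reports.
 */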
|
|
|
|
|
|
|
|
/* Scan function for ELF */
|
|
|
|
cl_error_t cli_scanelf(cli_ctx *ctx)
|
|
|
|
{
|
|
|
|
union elf_file_hdr file_hdr;
|
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │ └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
| └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │ └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │ └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │ └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │ └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │ └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │ └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
| └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │ └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
| └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set similar flag for the layer. The context flag is
reset those that anything after this doesn't have that flag.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning normalized extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost which result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filestypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmap's for all embedded files discovered with
file type recognition scans, this is a much more frequent occurence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the perfomance impact of creating fmap's
for all embedded files, but also should improve performance in general.
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded events will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
4 years ago

    fmap_t *map = ctx->fmap;
    cl_error_t ret;
    uint8_t conv = 0, is64 = 0;

    cli_dbgmsg("in cli_scanelf\n");

    /* Load header to determine size and class */
    ret = cli_elf_fileheader(ctx, map, &file_hdr, &conv, &is64);
    if (ret == CL_BREAK) {
        return CL_CLEAN; /* here, break means "exit but report clean" */
    } else if (ret != CL_CLEAN) {
        return ret;
    }
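
    /* Note: e_type and e_machine sit at the same offsets in both the 32-bit
     * and 64-bit ELF headers (immediately after e_ident), so reading them
     * through hdr64 is safe for either class. */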
    /* Log file type and machine type */
    switch (file_hdr.hdr64.e_type) {
        case 0x0: /* ET_NONE */
            cli_dbgmsg("ELF: File type: None\n");
            break;
        case 0x1: /* ET_REL */
            cli_dbgmsg("ELF: File type: Relocatable\n");
            break;
        case 0x2: /* ET_EXEC */
            cli_dbgmsg("ELF: File type: Executable\n");
            break;
        case 0x3: /* ET_DYN */
            cli_dbgmsg("ELF: File type: Shared object\n");
            break;
        case 0x4: /* ET_CORE */
            cli_dbgmsg("ELF: File type: Core\n");
            break;
        default:
            cli_dbgmsg("ELF: File type: Unknown (%d)\n", file_hdr.hdr64.e_type);
    }

    switch (file_hdr.hdr64.e_machine) {
        /* Due to a huge list, we only include the most popular machines here */
        case 0: /* EM_NONE */
            cli_dbgmsg("ELF: Machine type: None\n");
            break;
        case 2: /* EM_SPARC */
            cli_dbgmsg("ELF: Machine type: SPARC\n");
            break;
        case 3: /* EM_386 */
            cli_dbgmsg("ELF: Machine type: Intel 80386\n");
            break;
        case 4: /* EM_68K */
            cli_dbgmsg("ELF: Machine type: Motorola 68000\n");
            break;
        case 8: /* EM_MIPS */
            cli_dbgmsg("ELF: Machine type: MIPS RS3000\n");
            break;
        case 9: /* EM_S370 */
            cli_dbgmsg("ELF: Machine type: IBM System/370\n");
            break;
        case 15: /* EM_PARISC */
            cli_dbgmsg("ELF: Machine type: HPPA\n");
            break;
        case 20: /* EM_PPC */
            cli_dbgmsg("ELF: Machine type: PowerPC\n");
            break;
        case 21: /* EM_PPC64 */
            cli_dbgmsg("ELF: Machine type: PowerPC 64-bit\n");
            break;
        case 22: /* EM_S390 */
            cli_dbgmsg("ELF: Machine type: IBM S390\n");
            break;
        case 40: /* EM_ARM */
            cli_dbgmsg("ELF: Machine type: ARM\n");
            break;
        case 41: /* EM_FAKE_ALPHA */
            cli_dbgmsg("ELF: Machine type: Digital Alpha\n");
            break;
        case 43: /* EM_SPARCV9 */
            cli_dbgmsg("ELF: Machine type: SPARC v9 64-bit\n");
            break;
        case 50: /* EM_IA_64 */
            cli_dbgmsg("ELF: Machine type: IA64\n");
            break;
        case 62: /* EM_X86_64 */
            cli_dbgmsg("ELF: Machine type: AMD x86-64\n");
            break;
        default:
            cli_dbgmsg("ELF: Machine type: Unknown (0x%x)\n", file_hdr.hdr64.e_machine);
    }

    /* Program headers and Entry */
    if (is64) {
        ret = cli_elf_ph64(ctx, map, NULL, &(file_hdr.hdr64), conv);
    } else {
        ret = cli_elf_ph32(ctx, map, NULL, &(file_hdr.hdr32.hdr), conv);
    }
    if (ret == CL_BREAK) {
        return CL_CLEAN; /* break means "exit but report clean" */
    } else if (ret != CL_CLEAN) {
        return ret;
    }

    /* Sections */
    if (is64) {
        ret = cli_elf_sh64(ctx, map, NULL, &(file_hdr.hdr64), conv);
    } else {
        ret = cli_elf_sh32(ctx, map, NULL, &(file_hdr.hdr32.hdr), conv);
    }
    if (ret == CL_BREAK) {
        return CL_CLEAN; /* break means "exit but report clean" */
    } else if (ret != CL_CLEAN) {
        return ret;
    }

    return CL_CLEAN;
}

/* ELF header parsing only
 * Returns CL_SUCCESS on success, or a CL_E* error code on failure
 */
cl_error_t cli_elfheader(cli_ctx *ctx, struct cli_exe_info *elfinfo)
{
    union elf_file_hdr file_hdr;
    uint8_t conv = 0, is64 = 0;
    cl_error_t ret = CL_SUCCESS;

    cli_dbgmsg("in cli_elfheader\n");

    // TODO This code assumes elfinfo->offset == 0, which might not always
    // be the case. For now just print this debug message and continue on
    if (0 != elfinfo->offset) {
        cli_dbgmsg("cli_elfheader: Assumption Violated: elfinfo->offset != 0\n");
    }
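
    /* Note: ctx is passed as NULL below, making this a metadata-only parse;
     * the cli_elf_* helpers only append heuristic alerts (e.g. for broken
     * executables) when a scan context is provided. */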
    ret = cli_elf_fileheader(NULL, ctx->fmap, &file_hdr, &conv, &is64);
    if (ret != CL_SUCCESS) {
        goto done;
    }

    /* Program headers and Entry */
    if (is64) {
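        /* With a non-NULL elfinfo, this also records executable metadata
         * such as the raw entry-point offset for later use by the matcher. */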
        ret = cli_elf_ph64(NULL, ctx->fmap, elfinfo, &(file_hdr.hdr64), conv);
    } else {
        ret = cli_elf_ph32(NULL, ctx->fmap, elfinfo, &(file_hdr.hdr32.hdr), conv);
    }
    if (ret != CL_SUCCESS) {
        goto done;
    }

    /* Section Headers */
    if (is64) {
        ret = cli_elf_sh64(NULL, ctx->fmap, elfinfo, &(file_hdr.hdr64), conv);
    } else {
libclamav: Fix scan recursion tracking
Scan recursion is the process of identifying files embedded in other
files and then scanning them, recursively.
Internally this process is more complex than it may sound because a file
may have multiple layers of types before finding a new "file".
At present we treat the recursion count in the scanning context as an
index into both our fmap list AND our container list. These two lists
are conceptually a part of the same thing and should be unified.
But what's concerning is that the "recursion level" isn't actually
incremented or decremented at the same time that we add a layer to the
fmap or container lists but instead is more touchy-feely, increasing
when we find a new "file".
To account for this shadiness, the size of the fmap and container lists
has always been a little longer than our "max scan recursion" limit so
we don't accidentally overflow the fmap or container arrays (!).
I've implemented a single recursion-stack as an array, similar to before,
which includes a pointer to each fmap at each layer, along with the size
and type. Push and pop functions add and remove layers whenever a new
fmap is added. A boolean argument when pushing indicates if the new layer
represents a new buffer or new file (descriptor). A new buffer will reset
the "nested fmap level" (described below).
This commit also provides a solution for an issue where we detect
embedded files more than once during scan recursion.
For illustration, imagine a tarball named foo.tar.gz with this structure:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
But suppose baz.exe embeds a ZIP archive and a 7Z archive, like this:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| baz.exe | PE | 0 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| │ └── hello.txt | ASCII | 2 | 0 |
| └── sfx.7z | 7Z | 1 | 1 |
| └── world.txt | ASCII | 2 | 0 |
(A) If we scan for embedded files at any layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| ├── foo.tar | TAR | 1 | 0 |
| │ ├── bar.zip | ZIP | 2 | 1 |
| │ │ └── hola.txt | ASCII | 3 | 0 |
| │ ├── baz.exe | PE | 2 | 1 |
| │ │ ├── sfx.zip | ZIP | 3 | 1 |
| │ │ │ └── hello.txt | ASCII | 4 | 0 |
| │ │ └── sfx.7z | 7Z | 3 | 1 |
| │ │ └── world.txt | ASCII | 4 | 0 |
| │ ├── sfx.zip | ZIP | 2 | 1 |
| │ │ └── hello.txt | ASCII | 3 | 0 |
| │ └── sfx.7z | 7Z | 2 | 1 |
| │ └── world.txt | ASCII | 3 | 0 |
| ├── sfx.zip | ZIP | 1 | 1 |
| └── sfx.7z | 7Z | 1 | 1 |
(A) is bad because it scans content more than once.
Note that for the GZ layer, it may detect the ZIP and 7Z if the
signature hits on the compressed data, which it might, though
extracting the ZIP and 7Z will likely fail.
The reason the above doesn't happen now is that we restrict embedded
type scans for a bunch of archive formats to include GZ and TAR.
(B) If we scan for embedded files at the foo.tar layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| ├── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 2 | 1 |
| │ └── hello.txt | ASCII | 3 | 0 |
| └── sfx.7z | 7Z | 2 | 1 |
| └── world.txt | ASCII | 3 | 0 |
(B) is almost right. But we can achieve it easily enough only scanning for
embedded content in the current fmap when the "nested fmap level" is 0.
The upside is that it should safely detect all embedded content, even if
it may think the sfz.zip and sfx.7z are in foo.tar instead of in baz.exe.
The biggest risk I can think of affects ZIPs. SFXZIP detection
is identical to ZIP detection, which is why we don't allow SFXZIP to be
detected if insize of a ZIP. If we only allow embedded type scanning at
fmap-layer 0 in each buffer, this will fail to detect the embedded ZIP
if the bar.exe was not compressed in foo.zip and if non-compressed files
extracted from ZIPs aren't extracted as new buffers:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.zip | ZIP | 0 | 0 |
| └── bar.exe | PE | 1 | 1 |
| └── sfx.zip | ZIP | 2 | 2 |
Provided that we ensure all files extracted from zips are scanned in
new buffers, option (B) should be safe.
(C) If we scan for embedded files at the baz.exe layer, we may detect:
| description | type | rec level | nested fmap level |
| ------------------------- | ----- | --------- | ----------------- |
| foo.tar.gz | GZ | 0 | 0 |
| └── foo.tar | TAR | 1 | 0 |
| ├── bar.zip | ZIP | 2 | 1 |
| │ └── hola.txt | ASCII | 3 | 0 |
| └── baz.exe | PE | 2 | 1 |
| ├── sfx.zip | ZIP | 3 | 1 |
| │ └── hello.txt | ASCII | 4 | 0 |
| └── sfx.7z | 7Z | 3 | 1 |
| └── world.txt | ASCII | 4 | 0 |
(C) is right. But it's harder to achieve. For this example we can get it by
restricting 7ZSFX and ZIPSFX detection only when scanning an executable.
But that may mean losing detection of archives embedded elsewhere.
And we'd have to identify allowable container types for each possible
embedded type, which would be very difficult.
So this commit aims to solve the issue the (B)-way.
Note that in all situations, we still have to scan with file typing
enabled to determine if we need to reassign the current file type, such
as re-identifying a Bzip2 archive as a DMG that happens to be Bzip2-
compressed. Detection of DMG and a handful of other types rely on
finding data partway through or near the ned of a file before
reassigning the entire file as the new type.
Other fixes and considerations in this commit:
- The utf16 HTML parser has weak error handling, particularly with respect
to creating a nested fmap for scanning the ascii decoded file.
This commit cleans up the error handling and wraps the nested scan with
the recursion-stack push()/pop() for correct recursion tracking.
Before this commit, each container layer had a flag to indicate if the
container layer is valid.
We need something similar so that the cli_recursion_stack_get_*()
functions ignore normalized layers. Details...
Imagine an LDB signature for HTML content that specifies a ZIP
container. If the signature actually alerts on the normalized HTML and
you don't ignore normalized layers for the container check, it will
appear as though the alert is in an HTML container rather than a ZIP
container.
This commit accomplishes this with a boolean you set in the scan context
before scanning a new layer. Then when the new fmap is created, it will
use that flag to set a similar flag for the layer. The context flag is
then reset so that subsequent layers don't inherit it.
The flag allows the new recursion_stack_get() function to ignore
normalized layers when iterating the stack to return a layer at a
requested index, negative or positive.
Scanning extracted/normalized javascript and VBA should also
use the 'layer is normalized' flag.
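A minimal sketch of the lookup described above, assuming simplified stand-in types (the real context and layer structs carry much more state):
```c
#include <stddef.h>
#include <stdint.h>

#define LAYER_ATTRIBUTES_NORMALIZED 0x1

typedef struct {
    uint32_t attributes;
} layer_sketch_t;

typedef struct {
    layer_sketch_t *recursion_stack;
    int recursion_level; /* index of the innermost layer */
} ctx_sketch_t;

/* Return the layer `index` steps up from the innermost layer, skipping
 * layers that exist only because of normalization, so container checks
 * (e.g. "is this HTML inside a ZIP?") see the real container. */
static layer_sketch_t *recursion_stack_get(ctx_sketch_t *ctx, int index)
{
    for (int i = ctx->recursion_level; i >= 0; i--) {
        if (ctx->recursion_stack[i].attributes & LAYER_ATTRIBUTES_NORMALIZED)
            continue; /* normalized layers are invisible to container checks */
        if (index-- == 0)
            return &ctx->recursion_stack[i];
    }
    return NULL;
}
```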
- This commit also fixes Heuristic.Broken.Executable alert for ELF files
to make sure that:
A) these only alert if cli_append_virus() returns CL_VIRUS (aka it
respects the FP check).
B) all broken-executable alerts for ELF only happen if the
SCAN_HEURISTIC_BROKEN option is enabled.
- This commit also cleans up the error handling in cli_magic_scan_dir().
This was needed so we could correctly apply the layer-is-normalized-flag
to all VBA macros extracted to a directory when scanning the directory.
- Also fix an issue where exceeding scan maximums wouldn't cause embedded
file detection scans to abort. Granted we don't actually want to abort
if max filesize or max recursion depth are exceeded... only if max
scansize, max files, and max scantime are exceeded.
Add 'abort_scan' flag to scan context, to protect against depending on
correct error propagation for fatal conditions. Instead, setting this
flag in the scan context should guarantee that a fatal condition deep in
scan recursion isn't lost, which would result in more stuff being scanned
instead of aborting. This shouldn't be necessary, but some status codes
like CL_ETIMEOUT never used to be fatal and it's easier to do this than
to verify every parser only returns CL_ETIMEOUT and other "fatal
status codes" in fatal conditions.
- Remove duplicate is_tar() prototype from filetypes.c and include
is_tar.h instead.
- Presently we create the fmap hash when creating the fmap.
This wastes a bit of CPU if the hash is never needed.
Now that we're creating fmaps for all embedded files discovered with
file type recognition scans, this is a much more frequent occurrence and
really slows things down.
This commit fixes the issue by only creating fmap hashes as needed.
This should not only resolve the performance impact of creating fmaps
for all embedded files, but should also improve performance in general.
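A sketch of the on-demand pattern, assuming a hypothetical `compute_md5()` helper and a trimmed-down fmap:
```c
#include <stddef.h>

void compute_md5(const unsigned char *data, size_t len,
                 unsigned char out[16]); /* hypothetical hash helper */

typedef struct {
    const unsigned char *data;
    size_t len;
    unsigned char md5[16];
    int have_md5; /* 0 until the hash is first requested */
} fmap_sketch_t;

/* Hash on first use and cache the result; most fmaps created during
 * embedded-file recognition are never hashed at all, so they pay
 * nothing for this. */
static const unsigned char *fmap_get_md5(fmap_sketch_t *m)
{
    if (!m->have_md5) {
        compute_md5(m->data, m->len, m->md5);
        m->have_md5 = 1;
    }
    return m->md5;
}
```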
- Add allmatch check to the zip parser after the central-header meta
match. That way we don't get multiple alerts with the same match except in
allmatch mode. Clean up error handling in the zip parser a tiny bit.
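Roughly, the loop shape becomes (illustrative names, not the zip parser's actual variables):
```c
#include <stddef.h>

enum { NO_MATCH, MATCH_FOUND };

int check_central_header_meta(size_t entry_index); /* hypothetical matcher */

/* Walk the zip central directory; outside all-match mode, stop after
 * the first metadata match so one signature can't alert once per
 * remaining entry on the same sample. */
static int scan_central_directory(size_t num_entries, int allmatch)
{
    int result = NO_MATCH;
    for (size_t i = 0; i < num_entries; i++) {
        if (check_central_header_meta(i) == MATCH_FOUND) {
            result = MATCH_FOUND;
            if (!allmatch)
                break;
        }
    }
    return result;
}
```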
- Fixes to ensure that the scan limits such as scansize, filesize,
recursion depth, # of embedded files, and scantime are always reported
if AlertExceedsMax (--alert-exceeds-max) is enabled.
- Fixed an issue where non-fatal alerts for exceeding scan maximums may
mask signature matches later on. I changed it so these alerts use the
"possibly unwanted" alert-type and thus only alert if no other alerts
were found or if all-match or heuristic-precedence are enabled.
- Added the "Heuristics.Limits.Exceeded.*" events to the JSON metadata
when the --gen-json feature is enabled. These will show up once under
"ParseErrors" the first time a limit is exceeded. In the present
implementation, only one limits-exceeded event will be added, so as to
prevent a malicious or malformed sample from filling the JSON buffer
with millions of events and using a tonne of RAM.
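A sketch of the guard, with a hypothetical `record_parse_error()` standing in for the JSON-append call:
```c
void record_parse_error(const char *event_name); /* hypothetical JSON append */

typedef struct {
    int limits_event_added; /* set after the first limits event is recorded */
} json_state_sketch_t;

/* Emit a Heuristics.Limits.Exceeded.* event under "ParseErrors" at most
 * once per scan, so a sample that trips a limit in a tight loop can't
 * balloon the JSON buffer with millions of duplicate events. */
static void add_limits_exceeded_event(json_state_sketch_t *js, const char *which)
{
    if (js->limits_event_added)
        return;
    js->limits_event_added = 1;
    record_parse_error(which);
}
```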
        ret = cli_elf_sh32(NULL, ctx->fmap, elfinfo, &(file_hdr.hdr32.hdr), conv);
    }

    if (ret != CL_SUCCESS) {
        goto done;
    }
done:
    return ret;
}

/*
 * ELF file unpacking.
 */
cl_error_t cli_unpackelf(cli_ctx *ctx)
{
    cl_error_t ret = CL_SUCCESS;
    char *tempfile = NULL;
    int ndesc      = -1;
    struct cli_bc_ctx *bc_ctx;

    /* Bytecode BC_ELF_UNPACKER hook */
    bc_ctx = cli_bytecode_context_alloc();
    if (!bc_ctx) {
        cli_errmsg("cli_scanelf: can't allocate memory for bc_ctx\n");
        ret = CL_EMEM;
        goto done;
    }

    cli_bytecode_context_setctx(bc_ctx, ctx);

    cli_dbgmsg("Running bytecode hook\n");
    ret = cli_bytecode_runhook(ctx, ctx->engine, bc_ctx, BC_ELF_UNPACKER, ctx->fmap);
    cli_dbgmsg("Finished running bytecode hook\n");
    if (CL_SUCCESS == ret) {
        // check for unpacked/rebuilt executable
        ndesc = cli_bytecode_context_getresult_file(bc_ctx, &tempfile);
        if (ndesc != -1 && tempfile) {
            cli_dbgmsg("cli_scanelf: Unpacked and rebuilt ELF executable saved in %s\n", tempfile);

            lseek(ndesc, 0, SEEK_SET);

            cli_dbgmsg("***** Scanning rebuilt ELF file *****\n");
            ret = cli_magic_scan_desc(ndesc, tempfile, ctx, NULL, LAYER_ATTRIBUTES_NONE);
        }
    }

done:
    // cli_bytecode_context_getresult_file() gives up ownership of temp file, so we must clean it up.
    if (-1 != ndesc) {
        close(ndesc);
    }
    if (NULL != tempfile) {
        if (!ctx->engine->keeptmp) {
            (void)cli_unlink(tempfile);
        }
        free(tempfile);
    }

Fix bytecode hook out-file descriptor error handling
The cli_bc_ctx->outfd struct member was not properly initialized to -1.
Perhaps previous developers figured 0 was invalid-enough. All of the
checks for that file descriptor assumed 0 was the invalid value, going
so far as to explicitly set outfd to 0 if `open()` returned -1.
I didn't know this, so when I cleaned up the error handling in
`cli_unpackelf()` and `cli_unpackmacho()`, I had it `close(outfd)` when
not -1. That of course ended up closing stdin... and then all subsequent
file scans opened the file as fd `0`... which interestingly caused
`read()` and `stat()` errors, but only after scanning a Mach-O or ELF
file first.
Anyways... this commit fixes the issue by properly initializing outfd to
-1, and by changing any checks from 0 to -1.
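The fix boils down to this descriptor-lifecycle pattern (sketched with a trimmed struct; the real `cli_bc_ctx` has many more members):
```c
#include <unistd.h>

struct bc_ctx_sketch {
    int outfd; /* bytecode hook's rebuilt-file descriptor */
};

static void bc_ctx_init(struct bc_ctx_sketch *bc)
{
    /* -1, not 0: fd 0 is a valid descriptor (usually stdin), and
     * treating it as "unset" leads to close(0) and broken later scans. */
    bc->outfd = -1;
}

static void bc_ctx_cleanup(struct bc_ctx_sketch *bc)
{
    if (bc->outfd != -1) {
        close(bc->outfd);
        bc->outfd = -1;
    }
}
```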
I also found that it appears that the bytecode timeout wasn't being
applied to bytecode functions associated with logical signatures (that
is, those run by `cli_bytecode_runlsig()`).
What I see is that `ctx->bytecode_timeout` is only set to a non-zero
value in `cli_bytecode_context_alloc()`.
But for `cli_bytecode_runlsig()`, the bytecode context sits on the stack
and is memset instead. To resolve this, and ensure the bytecode context
is properly initialized, I created a new initialization function and
had it do the memset, instead of relying on calloc in the allocation
function.
I also removed the `bytecode_context_clear()` function because it simply
called `bytecode_context_reset()` and then did a memset. The memset is
unnecessary, especially since in most cases it's memsetting a stack
structure immediately before a return.
    if (NULL != bc_ctx) {
        cli_bytecode_context_destroy(bc_ctx);
    }

    return ret;
}