Store URLs from HTML when recording scan metadata json

Store URLs found in HTML `<a>` and `<form>` tags during scan of HTML files
when recording scan metadata.

HTML URL recording is ON by default, but it is part of the
generate-metadata-json feature, which is itself OFF by default.
URLs are therefore only recorded when metadata JSON generation is enabled.

This introduces a new general scan option:
- libclamav: `CL_SCAN_GENERAL_STORE_HTML_URLS`.
- ClamD: `JsonStoreHTMLUrls`.
- ClamScan: `--json-store-html-urls`

Thank you Matt Jolly for the helpful comment on the pull request.
pull/1281/head
Andy Ragusa 12 months ago committed by Micah Snyder
parent 8ae19eca40
commit 666e047f2b
No known key found for this signature in database
GPG Key ID: 3449E631914956D0
  1. 2
      clamscan/clamscan.c
  2. 4
      clamscan/manager.c
  3. 1
      common/optparser.c
  4. 6
      etc/clamd.conf.sample
  5. 1
      libclamav/clamav.h
  6. 6
      libclamav/hashtab.c
  7. 126
      libclamav/htmlnorm.c
  8. 9
      libclamav/htmlnorm.h
  9. 1
      libclamav/others.h
  10. 459
      libclamav/scanners.c
  11. 62
      unit_tests/clamscan/save_html_urls_test.py
  12. 16
      unit_tests/input/other_scanfiles/html/index.html
  13. 6
      win32/conf_examples/clamd.conf.sample

@ -254,6 +254,8 @@ void help(void)
mprintf(LOGG_INFO, " --gen-json[=yes/no(*)] Generate JSON metadata for the scanned file(s). For testing & development use ONLY.\n");
mprintf(LOGG_INFO, " JSON will be printed if --debug is enabled.\n");
mprintf(LOGG_INFO, " A JSON file will dropped to the temp directory if --leave-temps is enabled.\n");
mprintf(LOGG_INFO, " --json-store-html-urls[=yes(*)/no] Store html URLs in metadata.\n");
mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'HTMLUrls'\n");
mprintf(LOGG_INFO, " --database=FILE/DIR -d FILE/DIR Load virus database from FILE or load all supported db files from DIR\n");
mprintf(LOGG_INFO, " --official-db-only[=yes/no(*)] Only load official signatures\n");
mprintf(LOGG_INFO, " --fail-if-cvd-older-than=days Return with a nonzero error code if virus database outdated.\n");

@ -1574,6 +1574,10 @@ int scanmanager(const struct optstruct *opts)
options.general |= CL_SCAN_GENERAL_HEURISTICS;
}
if (optget(opts, "json-store-html-urls")->enabled) {
options.general |= CL_SCAN_GENERAL_STORE_HTML_URLS;
}
/* TODO: Remove deprecated option in a future feature release */
if ((optget(opts, "block-max")->enabled) ||
(optget(opts, "alert-exceeds-max")->enabled)) {

@ -389,6 +389,7 @@ const struct clam_option __clam_options[] = {
{"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"},
{"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"},
{"JsonStoreHTMLUrls", "json-store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML <form and <a tags.", "yes"},
{"HeuristicScanPrecedence", "heuristic-scan-precedence", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Allow heuristic match to take precedence.\nWhen enabled, if a heuristic scan (such as phishingScan) detects\na possible virus/phish it will stop scan immediately. Recommended, saves CPU\nscan-time.\nWhen disabled, virus/phish detected by heuristic scans will be reported only\nat the end of a scan. If an archive contains both a heuristically detected\nvirus/phish, and a real malware, the real malware will be reported.\nKeep this disabled if you intend to handle \"Heuristics.*\" viruses\ndifferently from \"real\" malware.\nIf a non-heuristically-detected virus (signature-based) is found first,\nthe scan is interrupted immediately, regardless of this config option.", "yes"},

@ -254,6 +254,12 @@ Example
# Default: no
#GenerateMetadataJson yes
# Store URLs found in HTML files in the JSON metadata.
# URLs will be stored in an array under the key 'HTMLUrls'.
# GenerateMetadataJson must be enabled for this option to have any effect.
# Default: yes (if GenerateMetadataJson is used)
#JsonStoreHTMLUrls no
# Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject
# any ALLMATCHSCAN command as invalid.
# Default: yes

@ -168,6 +168,7 @@ struct cl_scan_options {
#define CL_SCAN_GENERAL_HEURISTICS 0x4 /* option to enable heuristic alerts */
#define CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE 0x8 /* allow heuristic match to take precedence. */
#define CL_SCAN_GENERAL_UNPRIVILEGED 0x10 /* scanner will not have read access to files. */
#define CL_SCAN_GENERAL_STORE_HTML_URLS 0x20 /* Store urls found in html <a and <form tags when recording JSON metadata */
/* parsing capabilities options */
#define CL_SCAN_PARSE_ARCHIVE 0x1

@ -719,9 +719,9 @@ void cli_hashset_destroy(struct cli_hashset *hs)
hs->capacity = 0;
}
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val) & 0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val) & 0x1f)))
#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val) & 0x1f)))
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val)&0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val)&0x1f)))
#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val)&0x1f)))
/*
* searches the hashset for the @key.

@ -370,51 +370,70 @@ void html_tag_arg_add(tag_arguments_t *tags,
const char *tag, char *value)
{
int len, i;
tags->count++;
tags->tag = (unsigned char **)cli_max_realloc_or_free(tags->tag,
tags->count * sizeof(char *));
if (!tags->tag) {
int tagCnt = tags->count;
int valueCnt = tags->count;
int contentCnt = 0;
unsigned char **tmp = NULL;
tmp = (unsigned char **)cli_max_realloc(tags->tag, (tagCnt + 1) * sizeof(char *));
if (!tmp) {
goto done;
}
tags->value = (unsigned char **)cli_max_realloc_or_free(tags->value,
tags->count * sizeof(char *));
if (!tags->value) {
tags->tag = tmp;
tagCnt++;
tmp = (unsigned char **)cli_max_realloc(tags->value, (valueCnt + 1) * sizeof(char *));
if (!tmp) {
goto done;
}
tags->value = tmp;
valueCnt++;
if (tags->scanContents) {
tags->contents = (unsigned char **)cli_max_realloc_or_free(tags->contents,
tags->count * sizeof(*tags->contents));
if (!tags->contents) {
contentCnt = tags->count;
tmp = (unsigned char **)cli_max_realloc(tags->contents, (contentCnt + 1) * sizeof(*tags->contents));
if (!tmp) {
goto done;
}
tags->contents[tags->count - 1] = NULL;
tags->contents = tmp;
tags->contents[contentCnt] = NULL;
contentCnt++;
}
tags->tag[tags->count - 1] = (unsigned char *)cli_safer_strdup(tag);
tags->tag[tags->count] = (unsigned char *)cli_safer_strdup(tag);
if (value) {
if (*value == '"') {
tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value + 1);
len = strlen((const char *)value + 1);
tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value + 1);
if (NULL == tags->value[tags->count]) {
goto done;
}
len = strlen((const char *)value + 1);
if (len > 0) {
tags->value[tags->count - 1][len - 1] = '\0';
tags->value[tags->count][len - 1] = '\0';
}
} else {
tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value);
tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value);
}
} else {
tags->value[tags->count - 1] = NULL;
tags->value[tags->count] = NULL;
}
tags->count++;
return;
done:
/* Bad error - can't do 100% recovery */
tags->count--;
for (i = 0; i < tags->count; i++) {
for (i = 0; i < tagCnt; i++) {
if (tags->tag) {
free(tags->tag[i]);
}
}
for (i = 0; i < valueCnt; i++) {
if (tags->value) {
free(tags->value[i]);
}
}
for (i = 0; i < contentCnt; i++) {
if (tags->contents) {
if (tags->contents[i])
free(tags->contents[i]);
@ -649,7 +668,46 @@ static void js_process(struct parser_state *js_state, const unsigned char *js_be
}
}
static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
/**
 * @brief Append a copy of a URL string to a form_data_t URL list.
 *
 * The string is duplicated with cli_safer_strdup() and the list is grown by
 * one slot with cli_max_realloc(). The list owns the copy; release everything
 * with html_form_data_tag_free().
 *
 * On failure the list is left exactly as it was before the call. The URLs
 * collected so far stay reachable through `tags`, so they are neither leaked
 * nor lost and the caller can still free (or use) them.
 *
 * @param value  NUL-terminated string to record. NULL is rejected.
 * @param tags   List to append to. NULL is rejected.
 * @return true if the string was stored, false if nothing was added.
 */
bool html_insert_form_data(const char *const value, form_data_t *tags)
{
    char **grown = NULL;
    char *copy   = NULL;

    if (NULL == tags || NULL == value) {
        return false;
    }

    /* Duplicate first, so a failed copy never leaves a grown-but-unfilled slot. */
    copy = cli_safer_strdup(value);
    if (NULL == copy) {
        return false;
    }

    /*
     * Do NOT use cli_max_realloc_or_free here: if the array were freed on a
     * failed realloc, every previously stored string would be leaked.
     */
    grown = cli_max_realloc(tags->urls, (tags->count + 1) * sizeof(*tags->urls));
    if (NULL == grown) {
        free(copy);
        return false;
    }

    tags->urls              = grown;
    tags->urls[tags->count] = copy;
    tags->count++;

    return true;
}
/**
 * @brief Release every URL string held by a form_data_t and the array itself.
 *
 * After the call the structure is empty (urls == NULL, count == 0), so it is
 * safe to call this function again or to reuse the structure for a new scan.
 *
 * @param tags  List to clear. NULL is tolerated and ignored.
 */
void html_form_data_tag_free(form_data_t *tags)
{
    size_t i;

    if (NULL == tags) {
        return;
    }

    if (NULL != tags->urls) {
        for (i = 0; i < tags->count; i++) {
            CLI_FREE_AND_SET_NULL(tags->urls[i]);
        }
        CLI_FREE_AND_SET_NULL(tags->urls);
    }

    /* Keep count in step with the (now released) array so nothing indexes it later. */
    tags->count = 0;
}
static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
{
int fd_tmp, tag_length = 0, tag_arg_length = 0;
bool binary, retval = false, escape = false, hex = false;
@ -659,7 +717,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
FILE *stream_in = NULL;
html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE;
char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1];
char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value;
char tag_val[HTML_STR_LENGTH + 1], *tmp_file = NULL, *arg_value = NULL;
unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;
tag_arguments_t tag_args;
quoted_state quoted = NOT_QUOTED;
@ -1224,8 +1282,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
href_contents_begin = ptr;
}
if (strcmp(tag, "/form") == 0) {
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
in_form_action = NULL;
}
} else if (strcmp(tag, "script") == 0) {
@ -1310,9 +1369,13 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
} else if (strcmp(tag, "form") == 0 && hrefs->scanContents) {
const char *arg_action_value = html_tag_arg_value(&tag_args, "action");
if (arg_action_value) {
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
in_form_action = (unsigned char *)cli_safer_strdup(arg_action_value);
if (form_data) {
html_insert_form_data((const char *const)in_form_action, form_data);
}
}
} else if (strcmp(tag, "img") == 0) {
arg_value = html_tag_arg_value(&tag_args, "src");
@ -1917,8 +1980,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
done:
if (line) /* only needed for done case */
free(line);
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
if (in_ahref) /* tag not closed, force closing */
html_tag_contents_done(hrefs, in_ahref, &contents);
@ -1960,6 +2024,11 @@ done:
}
/**
 * @brief Normalise an HTML document held in a memory buffer.
 *
 * Convenience wrapper around html_normalise_mem_form_data() for callers that
 * do not want form-action URLs collected: it forwards every argument
 * unchanged and passes NULL for the form-data list.
 *
 * @return Whatever html_normalise_mem_form_data() returns.
 */
bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    bool normalised;

    /* NULL form_data: no form-action collection requested. */
    normalised = html_normalise_mem_form_data(ctx, in_buff, in_size, dirname, hrefs, dconf, NULL);

    return normalised;
}
bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
{
m_area_t m_area;
@ -1968,10 +2037,15 @@ bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, con
m_area.offset = 0;
m_area.map = NULL;
return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
}
/**
 * @brief Normalise an HTML document backed by an fmap.
 *
 * Convenience wrapper around html_normalise_map_form_data() for callers that
 * do not want form-action URLs collected: it forwards every argument
 * unchanged and passes NULL for the form-data list.
 *
 * @return Whatever html_normalise_map_form_data() returns.
 */
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    bool normalised;

    /* NULL form_data: no form-action collection requested. */
    normalised = html_normalise_map_form_data(ctx, map, dirname, hrefs, dconf, NULL);

    return normalised;
}
bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
{
bool retval = false;
m_area_t m_area;
@ -1979,7 +2053,7 @@ bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_argu
m_area.length = map->len;
m_area.offset = 0;
m_area.map = map;
retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
return retval;
}

@ -45,10 +45,19 @@ typedef struct m_area_tag {
fmap_t *map;
} m_area_t;
/**
 * Growable list of strings collected from <form action="..."> attributes
 * while an HTML document is being normalised.
 *
 * Entries are appended by html_insert_form_data() and the whole list is
 * released with html_form_data_tag_free().
 */
typedef struct form_data_tag {
/* Array of heap-allocated, NUL-terminated strings (copies made with cli_safer_strdup). */
char **urls;
/* Number of entries stored in urls. */
size_t count;
} form_data_t;
/* Normalise HTML held in a memory buffer; same as html_normalise_mem_form_data() with form_data == NULL. */
bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
/* As html_normalise_mem(), but when form_data is non-NULL the <form> "action" values encountered are appended to it. */
bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data);
/* Normalise HTML backed by an fmap; same as html_normalise_map_form_data() with form_data == NULL. */
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
/* As html_normalise_map(), but when form_data is non-NULL the <form> "action" values encountered are appended to it. */
bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data);
/* Release the contents of a tag_arguments_t. */
void html_tag_arg_free(tag_arguments_t *tags);
bool html_screnc_decode(fmap_t *map, const char *dirname);
/* Append a tag/value pair to tags; both strings are duplicated, and value may be NULL. */
void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value);
/* Free every string in tags->urls and then the urls array itself. */
void html_form_data_tag_free(form_data_t *tags);
#endif

@ -552,6 +552,7 @@ extern LIBCLAMAV_EXPORT int have_rar;
#define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS)
#define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE)
#define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED)
#define SCAN_STORE_HTML_URLS (ctx->options->general & CL_SCAN_GENERAL_STORE_HTML_URLS)
#define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE)
#define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF)

@ -2082,6 +2082,452 @@ done:
return ret;
}
/* Key of the JSON array in which save_urls() records the URLs found in an HTML file. */
const char *const HTML_URLS_JSON_KEY = "HTMLUrls";
/*
 * URI scheme prefixes recognised by is_url().
 *
 * Each entry is a scheme name followed by "://". is_url() accepts a string
 * only if it starts with one of these prefixes (compared case-insensitively)
 * and is longer than the prefix itself.
 *
 * Scheme names presumably taken from the IANA registry:
 * https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
 *
 * NOTE(review): because every entry includes "//", links written without an
 * authority part (e.g. "mailto:user@example.com" or "tel:+123") do not match
 * their entries here - confirm that this is intended.
 */
const char *URI_LIST[] = {
"aaa://",
"aaas://",
"about://",
"acap://",
"acct://",
"acd://",
"acr://",
"adiumxtra://",
"adt://",
"afp://",
"afs://",
"aim://",
"amss://",
"android://",
"appdata://",
"apt://",
"ar://",
"ark://",
"at://",
"attachment://",
"aw://",
"barion://",
"bb://",
"beshare://",
"bitcoin://",
"bitcoincash://",
"blob://",
"bolo://",
"brid://",
"browserext://",
"cabal://",
"calculator://",
"callto://",
"cap://",
"cast://",
"casts://",
"chrome://",
"chrome-extension://",
"cid://",
"coap://",
"coap+tcp://",
"coap+ws://",
"coaps://",
"coaps+tcp://",
"coaps+ws://",
"com-eventbrite-attendee://",
"content://",
"content-type://",
"crid://",
"cstr://",
"cvs://",
"dab://",
"dat://",
"data://",
"dav://",
"dhttp://",
"diaspora://",
"dict://",
"did://",
"dis://",
"dlna-playcontainer://",
"dlna-playsingle://",
"dns://",
"dntp://",
"doi://",
"dpp://",
"drm://",
"drop://",
"dtmi://",
"dtn://",
"dvb://",
"dvx://",
"dweb://",
"ed2k://",
"eid://",
"elsi://",
"embedded://",
"ens://",
"ethereum://",
"example://",
"facetime://",
"fax://",
"feed://",
"feedready://",
"fido://",
"file://",
"filesystem://",
"finger://",
"first-run-pen-experience://",
"fish://",
"fm://",
"ftp://",
"fuchsia-pkg://",
"geo://",
"gg://",
"git://",
"gitoid://",
"gizmoproject://",
"go://",
"gopher://",
"graph://",
"grd://",
"gtalk://",
"h323://",
"ham://",
"hcap://",
"hcp://",
"hs20://",
"http://",
"https://",
"hxxp://",
"hxxps://",
"hydrazone://",
"hyper://",
"iax://",
"icap://",
"icon://",
"im://",
"imap://",
"info://",
"iotdisco://",
"ipfs://",
"ipn://",
"ipns://",
"ipp://",
"ipps://",
"irc://",
"irc6://",
"ircs://",
"iris://",
"iris.beep://",
"iris.lwz://",
"iris.xpc://",
"iris.xpcs://",
"isostore://",
"itms://",
"jabber://",
"jar://",
"jms://",
"keyparc://",
"lastfm://",
"lbry://",
"ldap://",
"ldaps://",
"leaptofrogans://",
"lid://",
"lorawan://",
"lpa://",
"lvlt://",
"machineProvisioningProgressReporter://",
"magnet://",
"mailserver://",
"mailto://",
"maps://",
"market://",
"matrix://",
"message://",
"microsoft.windows.camera://",
"microsoft.windows.camera.multipicker://",
"microsoft.windows.camera.picker://",
"mid://",
"mms://",
"modem://",
"mongodb://",
"moz://",
"ms-access://",
"ms-appinstaller://",
"ms-browser-extension://",
"ms-calculator://",
"ms-drive-to://",
"ms-enrollment://",
"ms-excel://",
"ms-eyecontrolspeech://",
"ms-gamebarservices://",
"ms-gamingoverlay://",
"ms-getoffice://",
"ms-help://",
"ms-infopath://",
"ms-inputapp://",
"ms-launchremotedesktop://",
"ms-lockscreencomponent-config://",
"ms-media-stream-id://",
"ms-meetnow://",
"ms-mixedrealitycapture://",
"ms-mobileplans://",
"ms-newsandinterests://",
"ms-officeapp://",
"ms-people://",
"ms-project://",
"ms-powerpoint://",
"ms-publisher://",
"ms-recall://",
"ms-remotedesktop://",
"ms-remotedesktop-launch://",
"ms-restoretabcompanion://",
"ms-screenclip://",
"ms-screensketch://",
"ms-search://",
"ms-search-repair://",
"ms-secondary-screen-controller://",
"ms-secondary-screen-setup://",
"ms-settings://",
"ms-settings-airplanemode://",
"ms-settings-bluetooth://",
"ms-settings-camera://",
"ms-settings-cellular://",
"ms-settings-cloudstorage://",
"ms-settings-connectabledevices://",
"ms-settings-displays-topology://",
"ms-settings-emailandaccounts://",
"ms-settings-language://",
"ms-settings-location://",
"ms-settings-lock://",
"ms-settings-nfctransactions://",
"ms-settings-notifications://",
"ms-settings-power://",
"ms-settings-privacy://",
"ms-settings-proximity://",
"ms-settings-screenrotation://",
"ms-settings-wifi://",
"ms-settings-workplace://",
"ms-spd://",
"ms-stickers://",
"ms-sttoverlay://",
"ms-transit-to://",
"ms-useractivityset://",
"ms-virtualtouchpad://",
"ms-visio://",
"ms-walk-to://",
"ms-whiteboard://",
"ms-whiteboard-cmd://",
"ms-word://",
"msnim://",
"msrp://",
"msrps://",
"mss://",
"mt://",
"mtqp://",
"mumble://",
"mupdate://",
"mvn://",
"mvrp://",
"mvrps://",
"news://",
"nfs://",
"ni://",
"nih://",
"nntp://",
"notes://",
"num://",
"ocf://",
"oid://",
"onenote://",
"onenote-cmd://",
"opaquelocktoken://",
"openid://",
"openpgp4fpr://",
"otpauth://",
"p1://",
"pack://",
"palm://",
"paparazzi://",
"payment://",
"payto://",
"pkcs11://",
"platform://",
"pop://",
"pres://",
"prospero://",
"proxy://",
"pwid://",
"psyc://",
"pttp://",
"qb://",
"query://",
"quic-transport://",
"redis://",
"rediss://",
"reload://",
"res://",
"resource://",
"rmi://",
"rsync://",
"rtmfp://",
"rtmp://",
"rtsp://",
"rtsps://",
"rtspu://",
"sarif://",
"secondlife://",
"secret-token://",
"service://",
"session://",
"sftp://",
"sgn://",
"shc://",
"shttp://",
"sieve://",
"simpleledger://",
"simplex://",
"sip://",
"sips://",
"skype://",
"smb://",
"smp://",
"sms://",
"smtp://",
"snews://",
"snmp://",
"soap.beep://",
"soap.beeps://",
"soldat://",
"spiffe://",
"spotify://",
"ssb://",
"ssh://",
"starknet://",
"steam://",
"stun://",
"stuns://",
"submit://",
"svn://",
"swh://",
"swid://",
"swidpath://",
"tag://",
"taler://",
"teamspeak://",
"tel://",
"teliaeid://",
"telnet://",
"tftp://",
"things://",
"thismessage://",
"tip://",
"tn3270://",
"tool://",
"turn://",
"turns://",
"tv://",
"udp://",
"unreal://",
"upt://",
"urn://",
"ut2004://",
"uuid-in-package://",
"v-event://",
"vemmi://",
"ventrilo://",
"ves://",
"videotex://",
"vnc://",
"view-source://",
"vscode://",
"vscode-insiders://",
"vsls://",
"w3://",
"wais://",
"web3://",
"wcr://",
"webcal://",
"web+ap://",
"wifi://",
"wpid://",
"ws://",
"wss://",
"wtai://",
"wyciwyg://",
"xcon://",
"xcon-userid://",
"xfire://",
"xmlrpc.beep://",
"xmlrpc.beeps://",
"xmpp://",
"xftp://",
"xrcp://",
"xri://",
"ymsgr://",
"z39.50://",
"z39.50r://",
"z39.50s://"};
/**
 * @brief Decide whether a string starts with one of the URI_LIST scheme prefixes.
 *
 * The comparison is case-insensitive, and the string must be strictly longer
 * than the matching prefix (a bare "http://" is not accepted).
 *
 * @param str      String to test. NULL yields false.
 * @param str_len  Length of str, as reported by the caller.
 * @return true if str begins with a known prefix and has something after it.
 */
static bool is_url(const char *const str, size_t str_len)
{
    const size_t scheme_count = sizeof(URI_LIST) / sizeof(URI_LIST[0]);
    size_t idx;

    if (NULL == str) {
        return false;
    }

    for (idx = 0; idx < scheme_count; idx++) {
        const size_t prefix_len = strlen(URI_LIST[idx]);

        /* Too short to hold this prefix plus at least one more character. */
        if (str_len <= prefix_len) {
            continue;
        }

        if (0 == strncasecmp(str, URI_LIST[idx], prefix_len)) {
            return true;
        }
    }

    return false;
}
static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs, form_data_t *form_data)
{
int i = 0;
json_object *ary = NULL;
if (NULL == hrefs) {
return;
}
if (ctx->wrkproperty != ctx->properties) {
return;
}
if (!(SCAN_STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL))) {
return;
}
/*Add hrefs*/
for (i = 0; i < hrefs->count; i++) {
if (is_url((const char *)hrefs->value[i], strlen((const char *)hrefs->value[i]))) {
if (NULL == ary) {
ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY);
if (!ary) {
cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY);
return;
}
}
cli_jsonstr(ary, NULL, (const char *)hrefs->value[i]);
}
}
/*Add form_data*/
for (i = 0; i < (int)form_data->count; i++) {
if (is_url((const char *)form_data->urls[i], strlen((const char *)form_data->urls[i]))) {
if (NULL == ary) {
ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY);
if (!ary) {
cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY);
return;
}
}
cli_jsonstr(ary, NULL, (const char *)form_data->urls[i]);
}
}
}
static cl_error_t cli_scanhtml(cli_ctx *ctx)
{
cl_error_t status = CL_SUCCESS;
@ -2113,7 +2559,18 @@ static cl_error_t cli_scanhtml(cli_ctx *ctx)
cli_dbgmsg("cli_scanhtml: using tempdir %s\n", tempname);
(void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf);
/* Output JSON Summary Information */
if (SCAN_STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL)) {
tag_arguments_t hrefs = {0};
hrefs.scanContents = 1;
form_data_t form_data = {0};
(void)html_normalise_map_form_data(ctx, map, tempname, &hrefs, ctx->dconf, &form_data);
save_urls(ctx, &hrefs, &form_data);
html_tag_arg_free(&hrefs);
html_form_data_tag_free(&form_data);
} else {
(void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf);
}
snprintf(fullname, 1024, "%s" PATHSEP "nocomment.html", tempname);
fd = open(fullname, O_RDONLY | O_BINARY);

@ -0,0 +1,62 @@
# Copyright (C) 2020-2024 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
"""
Run clamscan tests.
"""
import sys
import os
import re
import shutil
sys.path.append('../unit_tests')
import testcase
class TC(testcase.TestCase):
    """Check that clamscan records URLs found in HTML in the metadata JSON."""

    @classmethod
    def setUpClass(cls):
        super(TC, cls).setUpClass()

    @classmethod
    def tearDownClass(cls):
        super(TC, cls).tearDownClass()

    def setUp(self):
        super(TC, self).setUp()

    def tearDown(self):
        super(TC, self).tearDown()

        # Remove scan temps directory between tests
        if (self.path_tmp / "TD").exists():
            shutil.rmtree(self.path_tmp / "TD")

        self.verify_valgrind_log()

    def test_save_links(self):
        """Scan the sample HTML file and expect its <a> and <form> URLs under 'HTMLUrls'."""
        self.step_name('Extract Links')

        # --leave-temps makes clamscan drop the metadata JSON in this directory.
        tempdir = self.path_tmp / "TD"
        os.makedirs(tempdir, exist_ok=True)

        testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'html' / 'index.html'
        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format(
            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
            path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
            tempdir=tempdir,
            testfile=testfile,
        )
        output = self.execute_command(command)

        assert output.ec == 0  # clean

        expected_strings = [
            'HTMLUrls',
            '"https://www.clamav.net/reports/malware"',
            '"http://www.google.com"',
        ]
        self.verify_metadata_json(tempdir, expected_strings)

@ -0,0 +1,16 @@
<!DOCTYPE html>
<html>
<body>
<h1>Save Links Unittest</h1>
<p>Paragraph</p>
<a href="https://www.clamav.net/reports/malware">Report Malware</a>
<form action="http://www.google.com">
<input type="submit">
</form>
</body>
</html>

@ -226,6 +226,12 @@ TCPAddr localhost
# Default: no
#GenerateMetadataJson yes
# Store URLs found in HTML files in the JSON metadata.
# URLs will be stored in an array under the key 'HTMLUrls'.
# GenerateMetadataJson must be enabled for this option to have any effect.
# Default: yes (if GenerateMetadataJson is used)
#JsonStoreHTMLUrls no
# Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject
# any ALLMATCHSCAN command as invalid.
# Default: yes

Loading…
Cancel
Save