Store URLs from HTML when recording scan metadata json

Store URLs found in HTML `<a>` and `<form>` tags when scanning HTML files with scan metadata recording enabled. HTML URL recording is ON by default, but it is part of the generate-metadata-json feature, which is OFF by default, so URLs are only stored when that feature is enabled.

This introduces a new general scan option:
- libclamav: `CL_SCAN_GENERAL_STORE_HTML_URLS`
- ClamD: `JsonStoreHTMLUrls`
- ClamScan: `--json-store-html-urls`

Thank you Matt Jolly for the helpful comment on the pull request.
12 months ago · 666e047f2b
parent 8ae19eca40
commit 666e047f2b
13 changed files with 669 additions and 30 deletions
--- a/clamscan/clamscan.c
+++ b/clamscan/clamscan.c
@ -254,6 +254,8 @@ void help(void)
    mprintf(LOGG_INFO, "    --gen-json[=yes/no(*)]               Generate JSON metadata for the scanned file(s). For testing & development use ONLY.\n");
    mprintf(LOGG_INFO, "                                         JSON will be printed if --debug is enabled.\n");
    mprintf(LOGG_INFO, "                                         A JSON file will dropped to the temp directory if --leave-temps is enabled.\n");
+    mprintf(LOGG_INFO, "    --json-store-html-urls[=yes(*)/no]   Store html URLs in metadata.\n");
+    mprintf(LOGG_INFO, "                                         URLs will be written to the metadata.json file in an array called 'HTMLUrls'\n");
    mprintf(LOGG_INFO, "    --database=FILE/DIR   -d FILE/DIR    Load virus database from FILE or load all supported db files from DIR\n");
    mprintf(LOGG_INFO, "    --official-db-only[=yes/no(*)]       Only load official signatures\n");
    mprintf(LOGG_INFO, "    --fail-if-cvd-older-than=days        Return with a nonzero error code if virus database outdated.\n");
--- a/clamscan/manager.c
+++ b/clamscan/manager.c
@ -1574,6 +1574,10 @@ int scanmanager(const struct optstruct *opts)
        options.general |= CL_SCAN_GENERAL_HEURISTICS;
    }

+    if (optget(opts, "json-store-html-urls")->enabled) {
+        options.general |= CL_SCAN_GENERAL_STORE_HTML_URLS;
+    }
+
    /* TODO: Remove deprecated option in a future feature release */
    if ((optget(opts, "block-max")->enabled) ||
        (optget(opts, "alert-exceeds-max")->enabled)) {
--- a/common/optparser.c
+++ b/common/optparser.c
@ -389,6 +389,7 @@ const struct clam_option __clam_options[] = {
    {"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"},

    {"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"},
+    {"JsonStoreHTMLUrls", "json-store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML <form and <a tags.", "yes"},

    {"HeuristicScanPrecedence", "heuristic-scan-precedence", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Allow heuristic match to take precedence.\nWhen enabled, if a heuristic scan (such as phishingScan) detects\na possible virus/phish it will stop scan immediately. Recommended, saves CPU\nscan-time.\nWhen disabled, virus/phish detected by heuristic scans will be reported only\nat the end of a scan. If an archive contains both a heuristically detected\nvirus/phish, and a real malware, the real malware will be reported.\nKeep this disabled if you intend to handle \"Heuristics.*\" viruses\ndifferently from \"real\" malware.\nIf a non-heuristically-detected virus (signature-based) is found first,\nthe scan is interrupted immediately, regardless of this config option.", "yes"},

--- a/etc/clamd.conf.sample
+++ b/etc/clamd.conf.sample
@ -254,6 +254,12 @@ Example
 # Default: no
 #GenerateMetadataJson yes

+# Store URLs found in html files to the json metadata.
+# URLs will be stored in an array with the tag 'HTMLUrls'
+# GenerateMetadataJson is required for this feature.
+# Default: yes (if GenerateMetadataJson is used)
+#JsonStoreHTMLUrls no
+
 # Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject
 # any ALLMATCHSCAN command as invalid.
 # Default: yes
--- a/libclamav/clamav.h
+++ b/libclamav/clamav.h
@ -168,6 +168,7 @@ struct cl_scan_options {
 #define CL_SCAN_GENERAL_HEURISTICS                  0x4  /* option to enable heuristic alerts */
 #define CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE        0x8  /* allow heuristic match to take precedence. */
 #define CL_SCAN_GENERAL_UNPRIVILEGED                0x10 /* scanner will not have read access to files. */
+#define CL_SCAN_GENERAL_STORE_HTML_URLS             0x20 /* Store urls found in html <a and <form tags when recording JSON metadata */

 /* parsing capabilities options */
 #define CL_SCAN_PARSE_ARCHIVE                       0x1
--- a/libclamav/hashtab.c
+++ b/libclamav/hashtab.c
@ -719,9 +719,9 @@ void cli_hashset_destroy(struct cli_hashset *hs)
    hs->capacity          = 0;
 }

-#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val) & 0x1f)))
-#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val) & 0x1f)))
-#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val) & 0x1f)))
+#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val)&0x1f)))
+#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val)&0x1f)))
+#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val)&0x1f)))

 /*
 * searches the hashset for the @key.
--- a/libclamav/htmlnorm.c
+++ b/libclamav/htmlnorm.c
@ -370,51 +370,70 @@ void html_tag_arg_add(tag_arguments_t *tags,
                      const char *tag, char *value)
 {
    int len, i;
-    tags->count++;
-    tags->tag = (unsigned char **)cli_max_realloc_or_free(tags->tag,
-                                                          tags->count * sizeof(char *));
-    if (!tags->tag) {
+    int tagCnt          = tags->count;
+    int valueCnt        = tags->count;
+    int contentCnt      = 0;
+    unsigned char **tmp = NULL;
+
+    tmp = (unsigned char **)cli_max_realloc(tags->tag, (tagCnt + 1) * sizeof(char *));
+    if (!tmp) {
        goto done;
    }
-    tags->value = (unsigned char **)cli_max_realloc_or_free(tags->value,
-                                                            tags->count * sizeof(char *));
-    if (!tags->value) {
+    tags->tag = tmp;
+    tagCnt++;
+
+    tmp = (unsigned char **)cli_max_realloc(tags->value, (valueCnt + 1) * sizeof(char *));
+    if (!tmp) {
        goto done;
    }
+    tags->value = tmp;
+    valueCnt++;
+
    if (tags->scanContents) {
-        tags->contents = (unsigned char **)cli_max_realloc_or_free(tags->contents,
-                                                                   tags->count * sizeof(*tags->contents));
-        if (!tags->contents) {
+        contentCnt = tags->count;
+        tmp        = (unsigned char **)cli_max_realloc(tags->contents, (contentCnt + 1) * sizeof(*tags->contents));
+        if (!tmp) {
            goto done;
        }
-        tags->contents[tags->count - 1] = NULL;
+        tags->contents             = tmp;
+        tags->contents[contentCnt] = NULL;
+        contentCnt++;
    }
-    tags->tag[tags->count - 1] = (unsigned char *)cli_safer_strdup(tag);
+
+    tags->tag[tags->count] = (unsigned char *)cli_safer_strdup(tag);
    if (value) {
        if (*value == '"') {
-            tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value + 1);
-            len                          = strlen((const char *)value + 1);
+            tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value + 1);
+            if (NULL == tags->value[tags->count]) {
+                goto done;
+            }
+            len = strlen((const char *)value + 1);
            if (len > 0) {
-                tags->value[tags->count - 1][len - 1] = '\0';
+                tags->value[tags->count][len - 1] = '\0';
            }
        } else {
-            tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value);
+            tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value);
        }
    } else {
-        tags->value[tags->count - 1] = NULL;
+        tags->value[tags->count] = NULL;
    }
+
+    tags->count++;
    return;

 done:
    /* Bad error - can't do 100% recovery */
-    tags->count--;
-    for (i = 0; i < tags->count; i++) {
+    for (i = 0; i < tagCnt; i++) {
        if (tags->tag) {
            free(tags->tag[i]);
        }
+    }
+    for (i = 0; i < valueCnt; i++) {
        if (tags->value) {
            free(tags->value[i]);
        }
+    }
+    for (i = 0; i < contentCnt; i++) {
        if (tags->contents) {
            if (tags->contents[i])
                free(tags->contents[i]);
@ -649,7 +668,46 @@ static void js_process(struct parser_state *js_state, const unsigned char *js_be
    }
 }

-static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
+/**
+ * @brief Append a copy of a string to a form_data_t collection.
+ *
+ * The urls array is grown by one slot and a duplicate of `value` is stored in
+ * the new slot. If duplicating the string fails, the count is left unchanged
+ * (the spare slot is simply unused) and the call still reports success.
+ *
+ * @param value  String to duplicate and store.
+ * @param tags   Collection to append to.
+ * @return true  The collection is intact (the string was stored, or only the copy failed).
+ * @return false The array could not be grown; everything stored so far has been
+ *               released and `tags` has been zeroed.
+ */
+bool html_insert_form_data(const char *const value, form_data_t *tags)
+{
+    bool bRet  = false;
+    size_t cnt = tags->count + 1;
+    size_t i;
+    char **tmp = NULL;
+
+    /*
+     * Do NOT use cli_max_realloc_or_free here: tags->urls has to remain usable
+     * when growing fails so that the strings it holds can be released below.
+     * (cli_max_realloc is expected to leave the original allocation untouched
+     * on failure.)
+     */
+    tmp = cli_max_realloc(tags->urls, cnt * sizeof(*tmp));
+    if (!tmp) {
+        goto done;
+    }
+    tags->urls = tmp;
+
+    tags->urls[tags->count] = cli_safer_strdup(value);
+    if (tags->urls[tags->count]) {
+        tags->count = cnt;
+    }
+
+    bRet = true;
+done:
+    if (!bRet) {
+        /* Release what was collected so far; zeroing alone would leak it. */
+        for (i = 0; i < tags->count; i++) {
+            free(tags->urls[i]);
+        }
+        free(tags->urls);
+        memset(tags, 0, sizeof(*tags));
+    }
+
+    return bRet;
+}
+
+/**
+ * @brief Release every string held by a form_data_t, and the array itself.
+ *
+ * On return the structure is empty (urls == NULL, count == 0), so calling this
+ * again on the same structure is harmless. Passing NULL is a no-op.
+ *
+ * @param tags  Collection to release; the structure itself is not freed.
+ */
+void html_form_data_tag_free(form_data_t *tags)
+{
+    size_t i;
+
+    if (NULL == tags) {
+        return;
+    }
+
+    if (NULL != tags->urls) {
+        for (i = 0; i < tags->count; i++) {
+            CLI_FREE_AND_SET_NULL(tags->urls[i]);
+        }
+        CLI_FREE_AND_SET_NULL(tags->urls);
+    }
+
+    /* Keep count in step with the (now released) array. */
+    tags->count = 0;
+}
+
+static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
 {
    int fd_tmp, tag_length = 0, tag_arg_length = 0;
    bool binary, retval = false, escape = false, hex = false;
@ -659,7 +717,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
    FILE *stream_in  = NULL;
    html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE;
    char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1];
-    char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value;
+    char tag_val[HTML_STR_LENGTH + 1], *tmp_file = NULL, *arg_value = NULL;
    unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;
    tag_arguments_t tag_args;
    quoted_state quoted  = NOT_QUOTED;
@ -1224,8 +1282,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
                            href_contents_begin = ptr;
                        }
                        if (strcmp(tag, "/form") == 0) {
-                            if (in_form_action)
+                            if (in_form_action) {
                                free(in_form_action);
+                            }
                            in_form_action = NULL;
                        }
                    } else if (strcmp(tag, "script") == 0) {
@ -1310,9 +1369,13 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
                        } else if (strcmp(tag, "form") == 0 && hrefs->scanContents) {
                            const char *arg_action_value = html_tag_arg_value(&tag_args, "action");
                            if (arg_action_value) {
-                                if (in_form_action)
+                                if (in_form_action) {
                                    free(in_form_action);
+                                }
                                in_form_action = (unsigned char *)cli_safer_strdup(arg_action_value);
+                                if (form_data) {
+                                    html_insert_form_data((const char *const)in_form_action, form_data);
+                                }
                            }
                        } else if (strcmp(tag, "img") == 0) {
                            arg_value = html_tag_arg_value(&tag_args, "src");
@ -1917,8 +1980,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
 done:
    if (line) /* only needed for done case */
        free(line);
-    if (in_form_action)
+    if (in_form_action) {
        free(in_form_action);
+    }
    if (in_ahref) /* tag not closed, force closing */
        html_tag_contents_done(hrefs, in_ahref, &contents);

@ -1960,6 +2024,11 @@ done:
 }

+/* Normalise an in-memory HTML buffer without collecting form action values: forwards to html_normalise_mem_form_data() with form_data set to NULL. */
bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
+{
+    return html_normalise_mem_form_data(ctx, in_buff, in_size, dirname, hrefs, dconf, NULL);
+}
+
+bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
 {
    m_area_t m_area;

@ -1968,10 +2037,15 @@ bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, con
    m_area.offset = 0;
    m_area.map    = NULL;

-    return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
+    return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
 }

+/* Normalise the HTML in an fmap without collecting form action values: forwards to html_normalise_map_form_data() with form_data set to NULL. */
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
+{
+    return html_normalise_map_form_data(ctx, map, dirname, hrefs, dconf, NULL);
+}
+
+bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
 {
    bool retval = false;
    m_area_t m_area;
@ -1979,7 +2053,7 @@ bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_argu
    m_area.length = map->len;
    m_area.offset = 0;
    m_area.map    = map;
-    retval        = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
+    retval        = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
    return retval;
 }

--- a/libclamav/htmlnorm.h
+++ b/libclamav/htmlnorm.h
@ -45,10 +45,19 @@ typedef struct m_area_tag {
    fmap_t *map;
 } m_area_t;

+/* Growable list of strings collected during HTML normalisation; release it with html_form_data_tag_free(). */
+typedef struct form_data_tag {
+    char **urls;  /* array of individually allocated string copies */
+    size_t count; /* number of entries stored in urls */
+} form_data_t;
+
 bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
+bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data);
 bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
+bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data);
 void html_tag_arg_free(tag_arguments_t *tags);
 bool html_screnc_decode(fmap_t *map, const char *dirname);
 void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value);

+void html_form_data_tag_free(form_data_t *tags);
+
 #endif
--- a/libclamav/others.h
+++ b/libclamav/others.h
@ -552,6 +552,7 @@ extern LIBCLAMAV_EXPORT int have_rar;
 #define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS)
 #define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE)
 #define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED)
+#define SCAN_STORE_HTML_URLS (ctx->options->general & CL_SCAN_GENERAL_STORE_HTML_URLS)

 #define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE)
 #define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF)
--- a/libclamav/scanners.c
+++ b/libclamav/scanners.c
@ -2082,6 +2082,452 @@ done:
    return ret;
 }

+/* Key of the JSON array that receives the URLs collected from an HTML file. */
+static const char *const HTML_URLS_JSON_KEY = "HTMLUrls";
+
+/*
+ * URL prefixes accepted by is_url().
+ * Scheme names come from https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+ *
+ * Every entry is a scheme name followed by "://" and is matched as a
+ * case-insensitive prefix, so a value written without the "//" part
+ * (for example "mailto:user@example.com") does not match.
+ */
+static const char *const URI_LIST[] = {
+    "aaa://",
+    "aaas://",
+    "about://",
+    "acap://",
+    "acct://",
+    "acd://",
+    "acr://",
+    "adiumxtra://",
+    "adt://",
+    "afp://",
+    "afs://",
+    "aim://",
+    "amss://",
+    "android://",
+    "appdata://",
+    "apt://",
+    "ar://",
+    "ark://",
+    "at://",
+    "attachment://",
+    "aw://",
+    "barion://",
+    "bb://",
+    "beshare://",
+    "bitcoin://",
+    "bitcoincash://",
+    "blob://",
+    "bolo://",
+    "brid://",
+    "browserext://",
+    "cabal://",
+    "calculator://",
+    "callto://",
+    "cap://",
+    "cast://",
+    "casts://",
+    "chrome://",
+    "chrome-extension://",
+    "cid://",
+    "coap://",
+    "coap+tcp://",
+    "coap+ws://",
+    "coaps://",
+    "coaps+tcp://",
+    "coaps+ws://",
+    "com-eventbrite-attendee://",
+    "content://",
+    "content-type://",
+    "crid://",
+    "cstr://",
+    "cvs://",
+    "dab://",
+    "dat://",
+    "data://",
+    "dav://",
+    "dhttp://",
+    "diaspora://",
+    "dict://",
+    "did://",
+    "dis://",
+    "dlna-playcontainer://",
+    "dlna-playsingle://",
+    "dns://",
+    "dntp://",
+    "doi://",
+    "dpp://",
+    "drm://",
+    "drop://",
+    "dtmi://",
+    "dtn://",
+    "dvb://",
+    "dvx://",
+    "dweb://",
+    "ed2k://",
+    "eid://",
+    "elsi://",
+    "embedded://",
+    "ens://",
+    "ethereum://",
+    "example://",
+    "facetime://",
+    "fax://",
+    "feed://",
+    "feedready://",
+    "fido://",
+    "file://",
+    "filesystem://",
+    "finger://",
+    "first-run-pen-experience://",
+    "fish://",
+    "fm://",
+    "ftp://",
+    "fuchsia-pkg://",
+    "geo://",
+    "gg://",
+    "git://",
+    "gitoid://",
+    "gizmoproject://",
+    "go://",
+    "gopher://",
+    "graph://",
+    "grd://",
+    "gtalk://",
+    "h323://",
+    "ham://",
+    "hcap://",
+    "hcp://",
+    "hs20://",
+    "http://",
+    "https://",
+    "hxxp://",
+    "hxxps://",
+    "hydrazone://",
+    "hyper://",
+    "iax://",
+    "icap://",
+    "icon://",
+    "im://",
+    "imap://",
+    "info://",
+    "iotdisco://",
+    "ipfs://",
+    "ipn://",
+    "ipns://",
+    "ipp://",
+    "ipps://",
+    "irc://",
+    "irc6://",
+    "ircs://",
+    "iris://",
+    "iris.beep://",
+    "iris.lwz://",
+    "iris.xpc://",
+    "iris.xpcs://",
+    "isostore://",
+    "itms://",
+    "jabber://",
+    "jar://",
+    "jms://",
+    "keyparc://",
+    "lastfm://",
+    "lbry://",
+    "ldap://",
+    "ldaps://",
+    "leaptofrogans://",
+    "lid://",
+    "lorawan://",
+    "lpa://",
+    "lvlt://",
+    "machineProvisioningProgressReporter://",
+    "magnet://",
+    "mailserver://",
+    "mailto://",
+    "maps://",
+    "market://",
+    "matrix://",
+    "message://",
+    "microsoft.windows.camera://",
+    "microsoft.windows.camera.multipicker://",
+    "microsoft.windows.camera.picker://",
+    "mid://",
+    "mms://",
+    "modem://",
+    "mongodb://",
+    "moz://",
+    "ms-access://",
+    "ms-appinstaller://",
+    "ms-browser-extension://",
+    "ms-calculator://",
+    "ms-drive-to://",
+    "ms-enrollment://",
+    "ms-excel://",
+    "ms-eyecontrolspeech://",
+    "ms-gamebarservices://",
+    "ms-gamingoverlay://",
+    "ms-getoffice://",
+    "ms-help://",
+    "ms-infopath://",
+    "ms-inputapp://",
+    "ms-launchremotedesktop://",
+    "ms-lockscreencomponent-config://",
+    "ms-media-stream-id://",
+    "ms-meetnow://",
+    "ms-mixedrealitycapture://",
+    "ms-mobileplans://",
+    "ms-newsandinterests://",
+    "ms-officeapp://",
+    "ms-people://",
+    "ms-project://",
+    "ms-powerpoint://",
+    "ms-publisher://",
+    "ms-recall://",
+    "ms-remotedesktop://",
+    "ms-remotedesktop-launch://",
+    "ms-restoretabcompanion://",
+    "ms-screenclip://",
+    "ms-screensketch://",
+    "ms-search://",
+    "ms-search-repair://",
+    "ms-secondary-screen-controller://",
+    "ms-secondary-screen-setup://",
+    "ms-settings://",
+    "ms-settings-airplanemode://",
+    "ms-settings-bluetooth://",
+    "ms-settings-camera://",
+    "ms-settings-cellular://",
+    "ms-settings-cloudstorage://",
+    "ms-settings-connectabledevices://",
+    "ms-settings-displays-topology://",
+    "ms-settings-emailandaccounts://",
+    "ms-settings-language://",
+    "ms-settings-location://",
+    "ms-settings-lock://",
+    "ms-settings-nfctransactions://",
+    "ms-settings-notifications://",
+    "ms-settings-power://",
+    "ms-settings-privacy://",
+    "ms-settings-proximity://",
+    "ms-settings-screenrotation://",
+    "ms-settings-wifi://",
+    "ms-settings-workplace://",
+    "ms-spd://",
+    "ms-stickers://",
+    "ms-sttoverlay://",
+    "ms-transit-to://",
+    "ms-useractivityset://",
+    "ms-virtualtouchpad://",
+    "ms-visio://",
+    "ms-walk-to://",
+    "ms-whiteboard://",
+    "ms-whiteboard-cmd://",
+    "ms-word://",
+    "msnim://",
+    "msrp://",
+    "msrps://",
+    "mss://",
+    "mt://",
+    "mtqp://",
+    "mumble://",
+    "mupdate://",
+    "mvn://",
+    "mvrp://",
+    "mvrps://",
+    "news://",
+    "nfs://",
+    "ni://",
+    "nih://",
+    "nntp://",
+    "notes://",
+    "num://",
+    "ocf://",
+    "oid://",
+    "onenote://",
+    "onenote-cmd://",
+    "opaquelocktoken://",
+    "openid://",
+    "openpgp4fpr://",
+    "otpauth://",
+    "p1://",
+    "pack://",
+    "palm://",
+    "paparazzi://",
+    "payment://",
+    "payto://",
+    "pkcs11://",
+    "platform://",
+    "pop://",
+    "pres://",
+    "prospero://",
+    "proxy://",
+    "pwid://",
+    "psyc://",
+    "pttp://",
+    "qb://",
+    "query://",
+    "quic-transport://",
+    "redis://",
+    "rediss://",
+    "reload://",
+    "res://",
+    "resource://",
+    "rmi://",
+    "rsync://",
+    "rtmfp://",
+    "rtmp://",
+    "rtsp://",
+    "rtsps://",
+    "rtspu://",
+    "sarif://",
+    "secondlife://",
+    "secret-token://",
+    "service://",
+    "session://",
+    "sftp://",
+    "sgn://",
+    "shc://",
+    "shttp://",
+    "sieve://",
+    "simpleledger://",
+    "simplex://",
+    "sip://",
+    "sips://",
+    "skype://",
+    "smb://",
+    "smp://",
+    "sms://",
+    "smtp://",
+    "snews://",
+    "snmp://",
+    "soap.beep://",
+    "soap.beeps://",
+    "soldat://",
+    "spiffe://",
+    "spotify://",
+    "ssb://",
+    "ssh://",
+    "starknet://",
+    "steam://",
+    "stun://",
+    "stuns://",
+    "submit://",
+    "svn://",
+    "swh://",
+    "swid://",
+    "swidpath://",
+    "tag://",
+    "taler://",
+    "teamspeak://",
+    "tel://",
+    "teliaeid://",
+    "telnet://",
+    "tftp://",
+    "things://",
+    "thismessage://",
+    "tip://",
+    "tn3270://",
+    "tool://",
+    "turn://",
+    "turns://",
+    "tv://",
+    "udp://",
+    "unreal://",
+    "upt://",
+    "urn://",
+    "ut2004://",
+    "uuid-in-package://",
+    "v-event://",
+    "vemmi://",
+    "ventrilo://",
+    "ves://",
+    "videotex://",
+    "vnc://",
+    "view-source://",
+    "vscode://",
+    "vscode-insiders://",
+    "vsls://",
+    "w3://",
+    "wais://",
+    "web3://",
+    "wcr://",
+    "webcal://",
+    "web+ap://",
+    "wifi://",
+    "wpid://",
+    "ws://",
+    "wss://",
+    "wtai://",
+    "wyciwyg://",
+    "xcon://",
+    "xcon-userid://",
+    "xfire://",
+    "xmlrpc.beep://",
+    "xmlrpc.beeps://",
+    "xmpp://",
+    "xftp://",
+    "xrcp://",
+    "xri://",
+    "ymsgr://",
+    "z39.50://",
+    "z39.50r://",
+    "z39.50s://"};
+
+/**
+ * @brief Report whether a string starts with one of the URI_LIST prefixes.
+ *
+ * The comparison is case-insensitive, and the string must be strictly longer
+ * than the prefix it matches (a bare "http://" is not accepted).
+ *
+ * @param str      Candidate string; NULL never matches.
+ * @param str_len  Length of `str`, supplied by the caller.
+ * @return true if some URI_LIST entry is a proper prefix of `str`, false otherwise.
+ */
+static bool is_url(const char *const str, size_t str_len)
+{
+    const size_t num_prefixes = sizeof(URI_LIST) / sizeof(URI_LIST[0]);
+    size_t idx;
+
+    if (NULL == str) {
+        return false;
+    }
+
+    for (idx = 0; idx < num_prefixes; idx++) {
+        const size_t prefix_len = strlen(URI_LIST[idx]);
+
+        /* Something must follow the prefix for the value to count as a URL. */
+        if (str_len <= prefix_len) {
+            continue;
+        }
+
+        if (0 == strncasecmp(str, URI_LIST[idx], prefix_len)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/**
+ * @brief Append one candidate string to the HTMLUrls JSON array if is_url() accepts it.
+ *
+ * The array is created under ctx->wrkproperty the first time a candidate is accepted.
+ *
+ * @param ctx            Scan context whose wrkproperty receives the array.
+ * @param[in,out] ary    Array handle; NULL until the first URL has been stored.
+ * @param candidate      String to test; NULL is skipped.
+ * @return false only when the JSON array could not be created, true otherwise.
+ */
+static bool save_url_candidate(cli_ctx *ctx, json_object **ary, const char *candidate)
+{
+    /* Entries may be NULL (e.g. a failed string copy), so check before strlen(). */
+    if ((NULL == candidate) || !is_url(candidate, strlen(candidate))) {
+        return true;
+    }
+
+    if (NULL == *ary) {
+        *ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY);
+        if (NULL == *ary) {
+            cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY);
+            return false;
+        }
+    }
+
+    cli_jsonstr(*ary, NULL, candidate);
+    return true;
+}
+
+/**
+ * @brief Record the URLs collected from an HTML file in the scan metadata JSON.
+ *
+ * Nothing is recorded unless SCAN_STORE_HTML_URLS and SCAN_COLLECT_METADATA are
+ * both set, ctx->wrkproperty is non-NULL, and ctx->wrkproperty is the same
+ * object as ctx->properties. Values accepted by is_url() are appended to a JSON
+ * array named by HTML_URLS_JSON_KEY; the array is only created once there is
+ * something to put in it.
+ *
+ * @param ctx        Scan context.
+ * @param hrefs      Tag values collected during normalisation; NULL makes this a no-op.
+ * @param form_data  Form action values collected during normalisation; may be NULL.
+ */
+static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs, form_data_t *form_data)
+{
+    int i            = 0;
+    size_t j         = 0;
+    json_object *ary = NULL;
+
+    if (NULL == hrefs) {
+        return;
+    }
+
+    if (ctx->wrkproperty != ctx->properties) {
+        return;
+    }
+
+    if (!(SCAN_STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL))) {
+        return;
+    }
+
+    /* Add hrefs */
+    if (NULL != hrefs->value) {
+        for (i = 0; i < hrefs->count; i++) {
+            if (!save_url_candidate(ctx, &ary, (const char *)hrefs->value[i])) {
+                return;
+            }
+        }
+    }
+
+    /* Add form_data */
+    if ((NULL != form_data) && (NULL != form_data->urls)) {
+        for (j = 0; j < form_data->count; j++) {
+            if (!save_url_candidate(ctx, &ary, (const char *)form_data->urls[j])) {
+                return;
+            }
+        }
+    }
+}
+
 static cl_error_t cli_scanhtml(cli_ctx *ctx)
 {
    cl_error_t status = CL_SUCCESS;
@ -2113,7 +2559,18 @@ static cl_error_t cli_scanhtml(cli_ctx *ctx)

    cli_dbgmsg("cli_scanhtml: using tempdir %s\n", tempname);

-    (void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf);
+    /* Output JSON Summary Information */
+    if (SCAN_STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL)) {
+        tag_arguments_t hrefs = {0};
+        hrefs.scanContents    = 1;
+        form_data_t form_data = {0};
+        (void)html_normalise_map_form_data(ctx, map, tempname, &hrefs, ctx->dconf, &form_data);
+        save_urls(ctx, &hrefs, &form_data);
+        html_tag_arg_free(&hrefs);
+        html_form_data_tag_free(&form_data);
+    } else {
+        (void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf);
+    }

    snprintf(fullname, 1024, "%s" PATHSEP "nocomment.html", tempname);
    fd = open(fullname, O_RDONLY | O_BINARY);
--- a/unit_tests/clamscan/save_html_urls_test.py
+++ b/unit_tests/clamscan/save_html_urls_test.py
@ -0,0 +1,62 @@
+# Copyright (C) 2020-2024 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
+
+"""
+Run clamscan tests.
+"""
+
+import sys
+import os
+import re
+import shutil
+
+sys.path.append('../unit_tests')
+import testcase
+
+
+class TC(testcase.TestCase):
+    """Check that clamscan writes the URLs found in an HTML file to the metadata JSON."""
+
+    @classmethod
+    def setUpClass(cls):
+        super(TC, cls).setUpClass()
+
+    @classmethod
+    def tearDownClass(cls):
+        super(TC, cls).tearDownClass()
+
+    def setUp(self):
+        super(TC, self).setUp()
+
+    def tearDown(self):
+        super(TC, self).tearDown()
+
+        # Remove scan temps directory between tests
+        if (self.path_tmp / "TD").exists():
+            shutil.rmtree(self.path_tmp / "TD")
+
+        self.verify_valgrind_log()
+
+    def test_save_links(self):
+        """Scan index.html with --gen-json and expect its <a href> and <form action> URLs under 'HTMLUrls'."""
+        self.step_name('Extract Links')
+
+        # --leave-temps keeps the generated metadata JSON in this directory for inspection.
+        tempdir = self.path_tmp / "TD"
+        os.makedirs(tempdir, exist_ok=True)
+
+        testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'html' / 'index.html'
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
+            path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
+            tempdir=tempdir,
+            testfile=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 0  # clean
+
+        expected_strings = [
+            'HTMLUrls',
+            '"https://www.clamav.net/reports/malware"',
+            '"http://www.google.com"',
+        ]
+        self.verify_metadata_json(tempdir, expected_strings)
+
+
+
--- a/unit_tests/input/other_scanfiles/html/index.html
+++ b/unit_tests/input/other_scanfiles/html/index.html
@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html>
+<body>
+
+<h1>Save Links Unittest</h1>
+<p>Paragraph</p>
+<a href="https://www.clamav.net/reports/malware">Report Malware</a>
+
+<form action="http://www.google.com">
+    <input type="submit">
+</form>
+
+
+</body>
+</html>
+
--- a/win32/conf_examples/clamd.conf.sample
+++ b/win32/conf_examples/clamd.conf.sample
@ -226,6 +226,12 @@ TCPAddr localhost
 # Default: no
 #GenerateMetadataJson yes

+# Store URLs found in html files to the json metadata.
+# URLs will be stored in an array with the tag 'HTMLUrls'
+# GenerateMetadataJson is required for this feature.
+# Default: yes (if GenerateMetadataJson is used)
+#JsonStoreHTMLUrls no
+
 # Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject
 # any ALLMATCHSCAN command as invalid.
 # Default: yes