From 8a77214c8292f5f7ce94dee7b4f363b5c14d91af Mon Sep 17 00:00:00 2001
From: Val Snyder
Date: Thu, 27 Mar 2025 14:15:48 -0400
Subject: [PATCH] Add CL_TYPE_AI_MODEL and associated file type magic signatures

This is just preliminary support for identifying an assortment of
different AI model files.

So far, this detects the following types:
- GGML GGUF (.gguf)
- ONNX AI (.onnx)
- TensorFlow Lite (.tflite)

Additional types to consider:
- SafeTensors (.safetensors)
- TensorFlow (.pb, .ckpt, .tfrecords)
- Keras (.keras)
- pickle (.pkl)
- numpy (.npy, .npz)
- coreml (.coreml)
- PyTorch (.pt, .pth, .bin, .mar, .pte, .pt2, .ptl)

Outside of being able to differentiate by file type, the scanner will
treat CL_TYPE_AI_MODEL the same as CL_TYPE_BINARY_DATA. We're not adding
parsers to further process these files, for now.
---
Review note: the onnx-caffe2 signature was listed twice in the
filetypes_int.h hunk; the redundant copy is dropped and the hunk and
diffstat counts are adjusted to match. The post-image blob id on that
file's "index" line has not been regenerated.

 libclamav/filetypes.c     |  1 +
 libclamav/filetypes.h     |  1 +
 libclamav/filetypes_int.h | 20 ++++++++++++++++++++
 libclamav/scanners.c      |  1 +
 4 files changed, 23 insertions(+)

diff --git a/libclamav/filetypes.c b/libclamav/filetypes.c
index 6d81c8aa9..a31006a91 100644
--- a/libclamav/filetypes.c
+++ b/libclamav/filetypes.c
@@ -142,6 +142,7 @@ static const struct ftmap_s {
     { "CL_TYPE_ONENOTE", CL_TYPE_ONENOTE },
     { "CL_TYPE_PYTHON_COMPILED", CL_TYPE_PYTHON_COMPILED },
     { "CL_TYPE_LHA_LZH", CL_TYPE_LHA_LZH },
+    { "CL_TYPE_AI_MODEL", CL_TYPE_AI_MODEL },
     { NULL, CL_TYPE_IGNORED }
 };
 // clang-format on
diff --git a/libclamav/filetypes.h b/libclamav/filetypes.h
index f1677e103..a6b804870 100644
--- a/libclamav/filetypes.h
+++ b/libclamav/filetypes.h
@@ -95,6 +95,7 @@ typedef enum cli_file {
     CL_TYPE_ONENOTE,
     CL_TYPE_PYTHON_COMPILED,
     CL_TYPE_LHA_LZH,
+    CL_TYPE_AI_MODEL,
 
     /* Section for partition types */
     CL_TYPE_PART_ANY, /* unknown partition type */
diff --git a/libclamav/filetypes_int.h b/libclamav/filetypes_int.h
index c25802162..47b477d05 100644
--- a/libclamav/filetypes_int.h
+++ b/libclamav/filetypes_int.h
@@ -302,5 +302,25 @@ static const char *ftypes_int[] = {
     "1:2:2d6c7a(73|34|35)2d:LHA archive using .LZS extension:CL_TYPE_ANY:CL_TYPE_LHA_LZH:210",
     "1:2:2d706d302d:LHA archive using PMarc (.PMA) extension:CL_TYPE_ANY:CL_TYPE_LHA_LZH:210",
     "0:0:414c5a01:ALZ:CL_TYPE_ANY:CL_TYPE_ALZ:210",
+    // GGML GGUF models
+    "0:0:4747554601000000:GGUF AI Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
+    "0:0:4747554602000000:GGUF AI Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
+    "0:0:4747554603000000:GGUF AI Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
+    // ONNX AI model detection, looking for: onnx_tool or onnx-tool
+    "1:0:08??12??6f6e6e78(2d|5f)746f6f6c:ONNX AI Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
+    // ONNX AI model detection, looking for: tf2onnx
+    "1:0:08??12??7466326f6e6e78:ONNX AI Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
+    // ONNX AI model detection, looking for: pytorch
+    "1:0:08??12??7079746f726368:ONNX AI Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
+    // ONNX AI model detection, looking for: caffe:
+    "1:0:08??12??63616666653a:ONNX AI Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
+    // ONNX AI model detection, looking for: OnnxMLTools
+    "1:0:08??12??4f6e6e784d4c546f6f6c73:ONNX AI Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
+    // ONNX AI model detection, looking for: CNTK
+    "1:0:08??12??434e544b:ONNX AI Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
+    // ONNX AI model detection, looking for: onnx-caffe2
+    "1:0:08??12??6f6e6e782d636166666532:ONNX AI Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
+    // tflite model detection
+    "0:4:54464c33:TensorFlow Lite Model File:CL_TYPE_ANY:CL_TYPE_AI_MODEL:220",
     NULL};
 #endif
diff --git a/libclamav/scanners.c b/libclamav/scanners.c
index b32eeeca0..db5b81807 100644
--- a/libclamav/scanners.c
+++ b/libclamav/scanners.c
@@ -5448,6 +5448,7 @@ cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type)
             perf_nested_stop(ctx, PERFT_MACHO, PERFT_SCAN);
             break;
 
+        case CL_TYPE_AI_MODEL:
         case CL_TYPE_PYTHON_COMPILED:
         case CL_TYPE_BINARY_DATA:
             ret = cli_scan_fmap(ctx, CL_TYPE_OTHER, false, NULL, AC_SCAN_VIR, NULL, NULL);