diff --git a/libclamav/pdf.c b/libclamav/pdf.c index 9dc82bb27..4da919563 100644 --- a/libclamav/pdf.c +++ b/libclamav/pdf.c @@ -3511,22 +3511,86 @@ static void pdf_export_json(struct pdf_struct *pdf) goto cleanup; } - if (pdf->stats.author) - cli_jsonstr(pdfobj, "Author", pdf->stats.author); - if (pdf->stats.creator) - cli_jsonstr(pdfobj, "Creator", pdf->stats.creator); - if (pdf->stats.producer) - cli_jsonstr(pdfobj, "Producer", pdf->stats.producer); - if (pdf->stats.modificationdate) - cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate); - if (pdf->stats.creationdate) - cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate); - if (pdf->stats.title) - cli_jsonstr(pdfobj, "Title", pdf->stats.title); - if (pdf->stats.subject) - cli_jsonstr(pdfobj, "Subject", pdf->stats.subject); - if (pdf->stats.keywords) - cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords); + if (pdf->stats.author) { + if (cli_isutf8(pdf->stats.author, strlen(pdf->stats.author))) + cli_jsonstr(pdfobj, "Author", pdf->stats.author); + else { + char *b64 = (char *)cl_base64_encode(pdf->stats.author, strlen(pdf->stats.author)); + cli_jsonstr(pdfobj, "Author", b64); + cli_jsonbool(pdfobj, "Author_base64", 1); + free(b64); + } + } + if (pdf->stats.creator) { + if (cli_isutf8(pdf->stats.creator, strlen(pdf->stats.creator))) + cli_jsonstr(pdfobj, "Creator", pdf->stats.creator); + else { + char *b64 = (char *)cl_base64_encode(pdf->stats.creator, strlen(pdf->stats.creator)); + cli_jsonstr(pdfobj, "Creator", b64); + cli_jsonbool(pdfobj, "Creator_base64", 1); + free(b64); + } + } + if (pdf->stats.producer) { + if (cli_isutf8(pdf->stats.producer, strlen(pdf->stats.producer))) + cli_jsonstr(pdfobj, "Producer", pdf->stats.producer); + else { + char *b64 = (char *)cl_base64_encode(pdf->stats.producer, strlen(pdf->stats.producer)); + cli_jsonstr(pdfobj, "Producer", b64); + cli_jsonbool(pdfobj, "Producer_base64", 1); + free(b64); + } + } + if (pdf->stats.modificationdate) { + if 
(cli_isutf8(pdf->stats.modificationdate, strlen(pdf->stats.modificationdate))) + cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate); + else { + char *b64 = (char *)cl_base64_encode(pdf->stats.modificationdate, strlen(pdf->stats.modificationdate)); + cli_jsonstr(pdfobj, "ModificationDate", b64); + cli_jsonbool(pdfobj, "ModificationDate_base64", 1); + free(b64); + } + } + if (pdf->stats.creationdate) { + if (cli_isutf8(pdf->stats.creationdate, strlen(pdf->stats.creationdate))) + cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate); + else { + char *b64 = (char *)cl_base64_encode(pdf->stats.creationdate, strlen(pdf->stats.creationdate)); + cli_jsonstr(pdfobj, "CreationDate", b64); + cli_jsonbool(pdfobj, "CreationDate_base64", 1); + free(b64); + } + } + if (pdf->stats.title) { + if (cli_isutf8(pdf->stats.title, strlen(pdf->stats.title))) + cli_jsonstr(pdfobj, "Title", pdf->stats.title); + else { + char *b64 = (char *)cl_base64_encode(pdf->stats.title, strlen(pdf->stats.title)); + cli_jsonstr(pdfobj, "Title", b64); + cli_jsonbool(pdfobj, "Title_base64", 1); + free(b64); + } + } + if (pdf->stats.subject) { + if (cli_isutf8(pdf->stats.subject, strlen(pdf->stats.subject))) + cli_jsonstr(pdfobj, "Subject", pdf->stats.subject); + else { + char *b64 = (char *)cl_base64_encode(pdf->stats.subject, strlen(pdf->stats.subject)); + cli_jsonstr(pdfobj, "Subject", b64); + cli_jsonbool(pdfobj, "Subject_base64", 1); + free(b64); + } + } + if (pdf->stats.keywords) { + if (cli_isutf8(pdf->stats.keywords, strlen(pdf->stats.keywords))) + cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords); + else { + char *b64 = (char *)cl_base64_encode(pdf->stats.keywords, strlen(pdf->stats.keywords)); + cli_jsonstr(pdfobj, "Keywords", b64); + cli_jsonbool(pdfobj, "Keywords_base64", 1); + free(b64); + } + } if (pdf->stats.ninvalidobjs) cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs); if (pdf->stats.njs) diff --git a/libclamav/str.c b/libclamav/str.c index 
/**
 * @brief Check whether a byte buffer is well-formed UTF-8 (RFC 3629).
 *
 * Plain ASCII bytes (including NUL) are accepted. Multi-byte sequences are
 * accepted only when they are 2 to 4 bytes long, are not truncated, are not
 * overlong encodings, do not encode UTF-16 surrogates (U+D800..U+DFFF) and
 * do not encode code points above U+10FFFF.
 *
 * @param buf  Bytes to examine; may be NULL only when len is 0.
 * @param len  Number of bytes in buf.
 * @return 1 if the whole buffer is well-formed UTF-8 (an empty buffer is
 *         considered valid), 0 otherwise.
 */
int cli_isutf8(const unsigned char *buf, unsigned int len)
{
    unsigned int i = 0;

    if (!buf)
        return len == 0;

    while (i < len) {
        const unsigned char lead = buf[i++];
        /* Allowed range for the first continuation byte; narrowed below for
         * the lead bytes that could otherwise start an invalid code point. */
        unsigned char first_min = 0x80;
        unsigned char first_max = 0xBF;
        unsigned int following;
        unsigned int j;

        if (lead < 0x80) {
            /* 0xxxxxxx is plain ASCII */
            continue;
        }

        if (lead >= 0xC2 && lead <= 0xDF) {
            /* 110xxxxx; 0xC0 and 0xC1 would only yield overlong forms */
            following = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            /* 1110xxxx */
            following = 2;
            if (lead == 0xE0)
                first_min = 0xA0; /* below this is an overlong encoding */
            else if (lead == 0xED)
                first_max = 0x9F; /* above this is a UTF-16 surrogate */
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            /* 11110xxx */
            following = 3;
            if (lead == 0xF0)
                first_min = 0x90; /* below this is an overlong encoding */
            else if (lead == 0xF4)
                first_max = 0x8F; /* above this exceeds U+10FFFF */
        } else {
            /* Stray continuation byte (10xxxxxx), overlong 2-byte lead,
             * or a lead byte of the obsolete 5/6-byte forms. */
            return 0;
        }

        /* i <= len holds here, so the subtraction cannot wrap. */
        if (following > len - i)
            return 0; /* sequence is truncated */

        if (buf[i] < first_min || buf[i] > first_max)
            return 0;
        i++;

        for (j = 1; j < following; j++, i++) {
            if ((buf[i] & 0xC0) != 0x80)
                return 0; /* not a 10xxxxxx continuation byte */
        }
    }

    return 1;
}