diff --git a/clamav-devel/ChangeLog b/clamav-devel/ChangeLog index aa1bd0df2..de49aa21d 100644 --- a/clamav-devel/ChangeLog +++ b/clamav-devel/ChangeLog @@ -1,7 +1,13 @@ +Sat Dec 18 16:32:51 GMT 2004 (njh) +---------------------------------- + * libclamav/mbox.c: Removed the need for a extra (short lived) parse tree + on some mail formats. This will help memory and performance + in some scenarios + Thu Dec 16 15:31:45 GMT 2004 (njh) ---------------------------------- * libclamav: Added some test software that decodes emails without parsing - them first. It is not enabled by default, use at your + them first. It is not enabled by default, use at your own risk and look at the comments first. Cleared a few problems in the decoding algorithms found when testing the above code @@ -50,7 +56,7 @@ Sun Dec 12 20:34:03 GMT 2004 (njh) Sun Dec 12 19:40:10 UTC 2004 (acab) ----------------------------------- * libclamav: upx: improved PE rebuild - debug info on failure - + Fri Dec 10 15:21:48 GMT 2004 (njh) ---------------------------------- * libclamav/message.c: Warn if the content-type contains a blank entry diff --git a/clamav-devel/libclamav/mbox.c b/clamav-devel/libclamav/mbox.c index c7049aa16..41f7b7fc6 100644 --- a/clamav-devel/libclamav/mbox.c +++ b/clamav-devel/libclamav/mbox.c @@ -17,6 +17,9 @@ * * Change History: * $Log: mbox.c,v $ + * Revision 1.202 2004/12/18 16:32:10 nigelhorne + * Added parseEmailFile + * * Revision 1.201 2004/12/16 15:29:08 nigelhorne * Tidy and add mmap test code * @@ -591,7 +594,7 @@ * Compilable under SCO; removed duplicate code with message.c * */ -static char const rcsid[] = "$Id: mbox.c,v 1.201 2004/12/16 15:29:08 nigelhorne Exp $"; +static char const rcsid[] = "$Id: mbox.c,v 1.202 2004/12/18 16:32:10 nigelhorne Exp $"; #if HAVE_CONFIG_H #include "clamav-config.h" @@ -731,6 +734,7 @@ typedef enum { FALSE = 0, TRUE = 1 } bool; #define PARTIAL_DIR static int cli_parse_mbox(const char *dir, int desc, unsigned int options); +static message *parseEmailFile(FILE *fin, const table_t *rfc821Table, const char *firstLine); static message *parseEmailHeaders(const message *m, const table_t *rfc821Table); static int parseEmailHeader(message *m, const char *line, const table_t *rfc821Table); static int parseEmailBody(message *messageIn, text *textIn, const char *dir, const table_t *rfc821Table, const table_t *subtypeTable, unsigned int options); @@ -747,6 +751,7 @@ static char *rfc822comments(const char *in); #ifdef PARTIAL_DIR static int rfc1341(message *m, const char *dir); #endif +static bool usefulHeader(int commandNumber, const char *cmd); #ifdef notdef static const char *cli_pmemstr(const char *haystack, size_t hs, const char *needle, size_t ns); #endif @@ -1093,7 +1098,7 @@ static int cli_parse_mbox(const char *dir, int desc, unsigned int options) { int retcode, i; - message *m, *body; + message *body; FILE *fd; char buffer[LINE_LENGTH + 1]; #ifdef HAVE_BACKTRACE @@ -1114,12 +1119,6 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options) fclose(fd); return CL_CLEAN; } - m = messageCreate(); - if(m == NULL) { - fclose(fd); - return CL_EMEM; - } - #ifdef CL_THREAD_SAFE pthread_mutex_lock(&tables_mutex); #endif @@ -1132,7 +1131,6 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options) #ifdef CL_THREAD_SAFE pthread_mutex_unlock(&tables_mutex); #endif - messageDestroy(m); fclose(fd); return CL_EMEM; } @@ -1154,8 +1152,20 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options) * Have been asked to check a UNIX style mbox file, which * may contain more than one e-mail message to decode */ - bool lastLineWasEmpty = FALSE; - int messagenumber = 1; + bool lastLineWasEmpty; + int messagenumber; + message *m = messageCreate(); + + if(m == NULL) { + fclose(fd); +#ifdef HAVE_BACKTRACE + signal(SIGSEGV, segv); +#endif + return CL_EMEM; + } + + lastLineWasEmpty = FALSE; + messagenumber = 1; do { /*cli_dbgmsg("read: %s", buffer);*/ @@ -1196,11 +1206,14 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options) break; } while(fgets(buffer, sizeof(buffer) - 1, fd) != NULL); + fclose(fd); + cli_dbgmsg("Extract attachments from email %d\n", messagenumber); + body = parseEmailHeaders(m, rfc821); + messageDestroy(m); } else { /* * It's a single message, parse the headers then the body - * Ignore blank lines at the start of the message */ if(strncmp(buffer, "P I ", 4) == 0) /* @@ -1219,40 +1232,10 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options) buffer[sizeof(buffer) - 1] = '\0'; - /* - * FIXME: files full of new lines and nothing else are - * handled ungracefully... - */ - do { - const char *ptr; - - /* - * TODO: this needlessly creates a message object, - * it'd be better if parseEmailHeaders could also - * read in from a file. I do not want to lump the - * parseEmailHeaders code here, that'd be a duplication - * of code I want to avoid - */ - (void)cli_chomp(buffer); - - /* - * Ignore leading CR, e.g. if newlines are LFCR instead - * or CRLF - */ - for(ptr = buffer; *ptr == '\r'; ptr++) - ; - /* - * Don't blank lines which are only spaces from - * headers, otherwise they'll be treated as the end of - * header marker - */ - if(messageAddStr(m, ptr) < 0) - break; - } while(fgets(buffer, sizeof(buffer) - 1, fd) != NULL); + body = parseEmailFile(fd, rfc821, buffer); + fclose(fd); } - fclose(fd); - /* * This is not necessarily true, but since the only options are * CL_CLEAN and CL_VIRUS this is the better choice. It would be @@ -1260,8 +1243,6 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options) */ retcode = CL_CLEAN; - body = parseEmailHeaders(m, rfc821); - messageDestroy(m); if(body) { /* * Write out the last entry in the mailbox @@ -1286,9 +1267,186 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options) } /* - * The given message contains a raw e-mail. + * Read in an email message from fin, parse it, and return the message * - * This function parses the headers of m and sets the message's arguments + * FIXME: files full of new lines and nothing else are + * handled ungracefully... + */ +static message * +parseEmailFile(FILE *fin, const table_t *rfc821, const char *firstLine) +{ + bool inHeader = TRUE; + bool contMarker = FALSE; + message *ret; + bool anyHeadersFound = FALSE; + int commandNumber = -1; + char *fullline = NULL; + size_t fulllinelength = 0; + char buffer[LINE_LENGTH+1]; + + cli_dbgmsg("parseEmailFile\n"); + + ret = messageCreate(); + if(ret == NULL) + return NULL; + + strcpy(buffer, firstLine); + do { + const char *start; + + (void)cli_chomp(buffer); + /* + * Ignore leading CR, e.g. if newlines are LFCR instead + * or CRLF + */ + for(start = buffer; *start == '\r'; start++) + ; + + if(start[0] == '\0') + start = NULL; + + /* + * Don't blank lines which are only spaces from headers, + * otherwise they'll be treated as the end of header marker + */ + if(inHeader) { + cli_dbgmsg("parseEmailFile: check '%s'\n", start ? start : ""); + if(start == NULL) { /* empty line */ + if(!contMarker) { + /* + * A blank line signifies the end of + * the header and the start of the text + */ + cli_dbgmsg("End of header information\n"); + inHeader = FALSE; + } else + contMarker = FALSE; + } else { + char *ptr; + const char *qptr; + int quotes, lookahead; + + if(fullline == NULL) { + char cmd[LINE_LENGTH + 1]; + + /* + * Continuation of line we're ignoring? + */ + if((start[0] == '\t') || (start[0] == ' ') || contMarker) { + contMarker = continuationMarker(start); + continue; + } + + /* + * Is this a header we're interested in? + */ + if((strchr(start, ':') == NULL) || + (cli_strtokbuf(start, 0, ":", cmd) == NULL)) { + if(strncmp(start, "From ", 5) == 0) + anyHeadersFound = TRUE; + continue; + } + + ptr = rfc822comments(cmd); + commandNumber = tableFind(rfc821, ptr ? ptr : cmd); + if(ptr) + free(ptr); + + switch(commandNumber) { + case CONTENT_TRANSFER_ENCODING: + case CONTENT_DISPOSITION: + case CONTENT_TYPE: + anyHeadersFound = TRUE; + break; + default: + if(!anyHeadersFound) + anyHeadersFound = usefulHeader(commandNumber, cmd); + continue; + } + fullline = strdup(start); + fulllinelength = strlen(start) + 1; + } else if(start != NULL) { + fulllinelength += strlen(start); + fullline = cli_realloc(fullline, fulllinelength); + strcat(fullline, start); + } + + contMarker = continuationMarker(start); + + if(contMarker) + continue; + + assert(fullline != NULL); + + lookahead = getc(fin); + if(lookahead != EOF) { + ungetc(lookahead, fin); + + /* + * Section B.2 of RFC822 says TAB or + * SPACE means a continuation of the + * previous entry. + * + * Add all the arguments on the line + */ + if((lookahead == '\t') || (lookahead == ' ')) + continue; + } + + quotes = 0; + for(qptr = start; *qptr; qptr++) + if(*qptr == '\"') + quotes++; + + if(quotes & 1) + continue; + + ptr = rfc822comments(fullline); + if(ptr) { + free(fullline); + fullline = ptr; + } + + if(parseEmailHeader(ret, fullline, rfc821) < 0) + continue; + + free(fullline); + fullline = NULL; + } + } else + /*cli_dbgmsg("Add line to body '%s'\n", start);*/ + if(messageAddStr(ret, start) < 0) + break; + } while(fgets(buffer, sizeof(buffer) - 1, fin) != NULL); + + if(fullline) { + if(*fullline) switch(commandNumber) { + case CONTENT_TRANSFER_ENCODING: + case CONTENT_DISPOSITION: + case CONTENT_TYPE: + cli_warnmsg("parseEmailHeaders: Fullline set '%s' - report to bugs@clamav.net\n", fullline); + } + free(fullline); + } + + if(!anyHeadersFound) { + /* + * False positive in believing we have an e-mail when we don't + */ + messageDestroy(ret); + cli_dbgmsg("parseEmailFile: no headers found, assuming it isn't an email\n"); + return NULL; + } + + messageClean(ret); + + cli_dbgmsg("parseEmailFile: return\n"); + + return ret; +} + +/* + * The given message contains a raw e-mail. * * Returns the message's body with the correct arguments set * @@ -1296,6 +1454,8 @@ cli_parse_mbox(const char *dir, int desc, unsigned int options) * of the message in memory, the upside is that it makes for easier parsing * of encapsulated messages, and in the long run uses less memory in those * scenarios + * + * TODO: remove the duplication with parseEmailFile */ static message * parseEmailHeaders(const message *m, const table_t *rfc821) @@ -1326,23 +1486,21 @@ parseEmailHeaders(const message *m, const table_t *rfc821) if(inHeader) { cli_dbgmsg("parseEmailHeaders: check '%s'\n", buffer ? buffer : ""); - if((buffer == NULL) && !contMarker) { - /* - * A blank line signifies the end of the header - * and the start of the text - */ - cli_dbgmsg("End of header information\n"); - inHeader = FALSE; + if(buffer == NULL) { + if(!contMarker) { + /* + * A blank line signifies the end of + * the header and the start of the text + */ + cli_dbgmsg("End of header information\n"); + inHeader = FALSE; + } else + contMarker = FALSE; } else { char *ptr; const char *qptr; int quotes; - if(buffer == NULL) { - contMarker = FALSE; - continue; - } - if(fullline == NULL) { char cmd[LINE_LENGTH + 1]; @@ -1376,12 +1534,8 @@ parseEmailHeaders(const message *m, const table_t *rfc821) anyHeadersFound = TRUE; break; default: - if(strcasecmp(cmd, "From") == 0) - anyHeadersFound = TRUE; - else if(strcasecmp(cmd, "Received") == 0) - anyHeadersFound = TRUE; - else if(strcasecmp(cmd, "De") == 0) - anyHeadersFound = TRUE; + if(!anyHeadersFound) + anyHeadersFound = usefulHeader(commandNumber, cmd); continue; } fullline = strdup(buffer); @@ -1397,9 +1551,9 @@ parseEmailHeaders(const message *m, const table_t *rfc821) if(contMarker) continue; - if(t->t_next && (t->t_next->t_line != NULL)) { - const char *next = lineGetData(t->t_next->t_line); + assert(fullline != NULL); + if(t->t_next && (t->t_next->t_line != NULL)) /* * Section B.2 of RFC822 says TAB or * SPACE means a continuation of the @@ -1407,9 +1561,11 @@ parseEmailHeaders(const message *m, const table_t *rfc821) * * Add all the arguments on the line */ - if((next[0] == '\t') || (next[0] == ' ')) - continue; - } + switch(lineGetData(t->t_next->t_line)[0]) { + case ' ': + case '\t': + continue; + } quotes = 0; for(qptr = buffer; *qptr; qptr++) @@ -1425,13 +1581,11 @@ parseEmailHeaders(const message *m, const table_t *rfc821) fullline = ptr; } - if(fullline) { - if(parseEmailHeader(ret, fullline, rfc821) < 0) - continue; + if(parseEmailHeader(ret, fullline, rfc821) < 0) + continue; - free(fullline); - fullline = NULL; - } + free(fullline); + fullline = NULL; } } else /*cli_dbgmsg("Add line to body '%s'\n", buffer);*/ @@ -3725,6 +3879,26 @@ print_trace(int use_syslog) } #endif +static bool +usefulHeader(int commandNumber, const char *cmd) +{ + switch(commandNumber) { + case CONTENT_TRANSFER_ENCODING: + case CONTENT_DISPOSITION: + case CONTENT_TYPE: + return TRUE; + default: + if(strcasecmp(cmd, "From") == 0) + return TRUE; + else if(strcasecmp(cmd, "Received") == 0) + return TRUE; + else if(strcasecmp(cmd, "De") == 0) + return TRUE; + } + + return FALSE; +} + #ifdef notdef /* * like cli_memstr - but returns the location of the match