📄 engine.c
字号:
/* ssdeep Copyright (C) 2006 ManTech CFIA. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA The code in this file, and this file only, is based on SpamSum, part of the Samba project: http://www.samba.org/ftp/unpacked/junkcode/spamsum/ Because of where this file came from, any program that contains it must be licensed under the terms of the General Public License (GPL). See the file COPYING for details. The author's original comments about licensing are below: this is a checksum routine that is specifically designed for spam. Copyright Andrew Tridgell <tridge@samba.org> 2002 This code is released under the GNU General Public License version 2 or later. Alteratively, you may also use this code under the terms of the Perl Artistic license. If you wish to distribute this code under the terms of a different free software license then please ask me. If there is a good reason then I will probably say yes.*/#include "main.h"#ifndef MIN#define MIN(a,b) ((a)<(b)?(a):(b))#endif#ifndef MAX#define MAX(a,b) ((a)>(b)?(a):(b))#endiftypedef unsigned char uchar;// the output is a string of length 64 in base64#define SPAMSUM_LENGTH 64#define MAX_RESULT (SPAMSUM_LENGTH + (SPAMSUM_LENGTH/2 + 20))#define MIN_BLOCKSIZE 3#define ROLLING_WINDOW 7#define HASH_PRIME 0x01000193#define HASH_INIT 0x28021967// Our input buffer when reading files to hash#define BUFFER_SIZE 8192static struct { uchar window[ROLLING_WINDOW]; uint32_t h1, h2, h3; uint32_t n;} roll_state;/* a rolling hash, based on the Adler checksum. By using a rolling hash we can perform auto resynchronisation after inserts/deletes internally, h1 is the sum of the bytes in the window and h2 is the sum of the bytes times the index h3 is a shift/xor based rolling hash, and is mostly needed to ensure that we can cope with large blocksize values*/static inline uint32_t roll_hash(uchar c){ roll_state.h2 -= roll_state.h1; roll_state.h2 += ROLLING_WINDOW * c; roll_state.h1 += c; roll_state.h1 -= roll_state.window[roll_state.n % ROLLING_WINDOW]; roll_state.window[roll_state.n % ROLLING_WINDOW] = c; roll_state.n++; /* The original spamsum AND'ed this value with 0xFFFFFFFF which in theory should have no effect. This AND has been removed for performance (jk) */ roll_state.h3 = (roll_state.h3 << 5); //& 0xFFFFFFFF; roll_state.h3 ^= c; return roll_state.h1 + roll_state.h2 + roll_state.h3;}/* reset the state of the rolling hash and return the initial rolling hash value*/static uint32_t roll_reset(void){ memset(&roll_state, 0, sizeof(roll_state)); return 0;}/* a simple non-rolling hash, based on the FNV hash */static inline uint32_t sum_hash(uchar c, uint32_t h){ h *= HASH_PRIME; h ^= c; return h;}typedef struct _ss_context { char *ret, *p; uint32_t total_chars; uint32_t h, h2, h3; uint32_t j, n, i, k; uint32_t block_size; uchar ret2[SPAMSUM_LENGTH/2 + 1];} ss_context;static void ss_destroy(ss_context *ctx){ if (ctx->ret != NULL) free(ctx->ret);}static int ss_init(ss_context *ctx, FILE *handle){ ctx->ret = (char *)malloc(sizeof(char) * MAX_RESULT); if (ctx->ret == NULL) return TRUE; ctx->total_chars = find_file_size(handle); ctx->block_size = MIN_BLOCKSIZE; while (ctx->block_size * SPAMSUM_LENGTH < ctx->total_chars) { ctx->block_size = ctx->block_size * 2; } return FALSE;}const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";static void ss_engine(ss_context *ctx, unsigned char *buffer, uint32_t buffer_size){ uint32_t i; for ( i = 0 ; i < buffer_size ; ++i) { /* at each character we update the rolling hash and the normal hash. When the rolling hash hits the reset value then we emit the normal hash as a element of the signature and reset both hashes */ ctx->h = roll_hash(buffer[i]); ctx->h2 = sum_hash(buffer[i], ctx->h2); ctx->h3 = sum_hash(buffer[i], ctx->h3); if (ctx->h % ctx->block_size == (ctx->block_size-1)) { /* we have hit a reset point. We now emit a hash which is based on all chacaters in the piece of the message between the last reset point and this one */ ctx->p[ctx->j] = b64[ctx->h2 % 64]; if (ctx->j < SPAMSUM_LENGTH-1) { /* we can have a problem with the tail overflowing. The easiest way to cope with this is to only reset the second hash if we have room for more characters in our signature. This has the effect of combining the last few pieces of the message into a single piece */ ctx->h2 = HASH_INIT; (ctx->j)++; } } /* this produces a second signature with a block size of block_size*2. By producing dual signatures in this way the effect of small changes in the message size near a block size boundary is greatly reduced. */ if (ctx->h % (ctx->block_size*2) == ((ctx->block_size*2)-1)) { ctx->ret2[ctx->k] = b64[ctx->h3 % 64]; if (ctx->k < SPAMSUM_LENGTH/2-1) { ctx->h3 = HASH_INIT; (ctx->k)++; } } }}static int ss_update(ss_context *ctx, FILE *handle){ uint32_t bytes_read; unsigned char *buffer; buffer = (unsigned char *)malloc(sizeof(unsigned char) * BUFFER_SIZE); if (buffer == NULL) return TRUE; snprintf(ctx->ret, 12, "%u:", ctx->block_size); ctx->p = ctx->ret + strlen(ctx->ret); memset(ctx->p, 0, SPAMSUM_LENGTH+1); memset(ctx->ret2, 0, sizeof(ctx->ret2)); ctx->k = ctx->j = 0; ctx->h3 = ctx->h2 = HASH_INIT; ctx->h = roll_reset(); while ((bytes_read = fread(buffer,sizeof(unsigned char),BUFFER_SIZE,handle)) > 0) { ss_engine(ctx,buffer,bytes_read); } if (ctx->h != 0) { ctx->p[ctx->j] = b64[ctx->h2 % 64]; ctx->ret2[ctx->k] = b64[ctx->h3 % 64]; } strcat(ctx->p+ctx->j, ":"); strcat(ctx->p+ctx->j, ctx->ret2); free(buffer); return FALSE;}static int ss_compute(FILE *handle, char *result){ int done = FALSE; ss_context *ctx = (ss_context *)malloc(sizeof(ss_context)); if (ctx == NULL) return TRUE; ss_init(ctx, handle); while (!done) { rewind(handle); ss_update(ctx,handle); /* our blocksize guess may have been way off - repeat if necessary */ if (ctx->block_size > MIN_BLOCKSIZE && ctx->j < SPAMSUM_LENGTH/2) ctx->block_size = ctx->block_size / 2; else done = TRUE; } strncpy(result,ctx->ret,MAX_RESULT); ss_destroy(ctx); free(ctx); return FALSE;}/* we only accept a match if we have at least one common substring in the signature of length ROLLING_WINDOW. This dramatically drops the false positive rate for low score thresholds while having negligable affect on the rate of spam detection. return 1 if the two strings do have a common substring, 0 otherwise*/static int has_common_substring(const char *s1, const char *s2){ int i, j; int num_hashes; uint32_t hashes[SPAMSUM_LENGTH]; /* there are many possible algorithms for common substring detection. In this case I am re-using the rolling hash code to act as a filter for possible substring matches */ roll_reset(); memset(hashes, 0, sizeof(hashes)); /* first compute the windowed rolling hash at each offset in the first string */ for (i=0;s1[i];i++) { hashes[i] = roll_hash((uchar)s1[i]); } num_hashes = i; roll_reset();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -