📄 engine.c~

📁 检测文件的相似性的一个小程序！使用Linux系统编写
💻 C~
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* ssdeep   Copyright (C) 2006 ManTech CFIA.   This program is free software; you can redistribute it and/or modify   it under the terms of the GNU General Public License as published by   the Free Software Foundation; either version 2 of the License, or   (at your option) any later version.      This program is distributed in the hope that it will be useful,   but WITHOUT ANY WARRANTY; without even the implied warranty of   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   GNU General Public License for more details.      You should have received a copy of the GNU General Public License   along with this program; if not, write to the Free Software   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA   The code in this file, and this file only, is based on SpamSum, part    of the Samba project:          http://www.samba.org/ftp/unpacked/junkcode/spamsum/   Because of where this file came from, any program that contains it   must be licensed under the terms of the General Public License (GPL).   See the file COPYING for details. The author's original comments   about licensing are below:  this is a checksum routine that is specifically designed for spam.   Copyright Andrew Tridgell <tridge@samba.org> 2002  This code is released under the GNU General Public License version 2  or later.  Alteratively, you may also use this code under the terms  of the Perl Artistic license.  If you wish to distribute this code under the terms of a different  free software license then please ask me. If there is a good reason  then I will probably say yes.*/#include "main.h"#ifndef MIN#define MIN(a,b) ((a)<(b)?(a):(b))#endif#ifndef MAX#define MAX(a,b) ((a)>(b)?(a):(b))#endiftypedef unsigned char uchar;// the output is a string of length 64 in base64#define SPAMSUM_LENGTH 64#define MAX_RESULT    (SPAMSUM_LENGTH + (SPAMSUM_LENGTH/2 + 20))#define MIN_BLOCKSIZE  3#define ROLLING_WINDOW 7#define HASH_PRIME     0x01000193#define HASH_INIT      0x28021967// Our input buffer when reading files to hash#define BUFFER_SIZE  8192static struct {  uchar window[ROLLING_WINDOW];  uint32_t h1, h2, h3;  uint32_t n;} roll_state;/*  a rolling hash, based on the Adler checksum. By using a rolling hash  we can perform auto resynchronisation after inserts/deletes  internally, h1 is the sum of the bytes in the window and h2  is the sum of the bytes times the index  h3 is a shift/xor based rolling hash, and is mostly needed to ensure that  we can cope with large blocksize values*/static inline uint32_t roll_hash(uchar c){  roll_state.h2 -= roll_state.h1;  roll_state.h2 += ROLLING_WINDOW * c;    roll_state.h1 += c;  roll_state.h1 -= roll_state.window[roll_state.n % ROLLING_WINDOW];    roll_state.window[roll_state.n % ROLLING_WINDOW] = c;  roll_state.n++;    /* The original spamsum AND'ed this value with 0xFFFFFFFF which     in theory should have no effect. This AND has been removed      for performance (jk) */  roll_state.h3 = (roll_state.h3 << 5); //& 0xFFFFFFFF;  roll_state.h3 ^= c;    return roll_state.h1 + roll_state.h2 + roll_state.h3;}/*  reset the state of the rolling hash and return the initial rolling hash value*/static uint32_t roll_reset(void){	  memset(&roll_state, 0, sizeof(roll_state));  return 0;}/* a simple non-rolling hash, based on the FNV hash */static inline uint32_t sum_hash(uchar c, uint32_t h){  h *= HASH_PRIME;  h ^= c;  return h;}typedef struct _ss_context {  char *ret, *p;  uint32_t total_chars;  uint32_t h, h2, h3;  uint32_t j, n, i, k;  uint32_t block_size;  uchar ret2[SPAMSUM_LENGTH/2 + 1];} ss_context;static void ss_destroy(ss_context *ctx){  if (ctx->ret != NULL)    free(ctx->ret);}static int ss_init(ss_context *ctx, FILE *handle){  ctx->ret = (char *)malloc(sizeof(char) * MAX_RESULT);  if (ctx->ret == NULL)    return TRUE;  ctx->total_chars = find_file_size(handle);  ctx->block_size = MIN_BLOCKSIZE;  while (ctx->block_size * SPAMSUM_LENGTH < ctx->total_chars) {    ctx->block_size = ctx->block_size * 2;  }  return FALSE;}const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";static void ss_engine(ss_context *ctx, unsigned char *buffer, uint32_t buffer_size){  uint32_t i;  for ( i = 0 ; i < buffer_size ; ++i)  {    /*        at each character we update the rolling hash and       the normal hash. When the rolling hash hits the       reset value then we emit the normal hash as a       element of the signature and reset both hashes    */    ctx->h  = roll_hash(buffer[i]);    ctx->h2 = sum_hash(buffer[i], ctx->h2);    ctx->h3 = sum_hash(buffer[i], ctx->h3);        if (ctx->h % ctx->block_size == (ctx->block_size-1)) {      /* we have hit a reset point. We now emit a	 hash which is based on all chacaters in the	 piece of the message between the last reset	 point and this one */      ctx->p[ctx->j] = b64[ctx->h2 % 64];		      if (ctx->j < SPAMSUM_LENGTH-1) {	/* we can have a problem with the tail	   overflowing. The easiest way to	   cope with this is to only reset the	   second hash if we have room for	   more characters in our	   signature. This has the effect of	   combining the last few pieces of	   the message into a single piece */	ctx->h2 = HASH_INIT;	(ctx->j)++;      }    }        /* this produces a second signature with a block size       of block_size*2. By producing dual signatures in       this way the effect of small changes in the message       size near a block size boundary is greatly reduced. */    if (ctx->h % (ctx->block_size*2) == ((ctx->block_size*2)-1)) {      ctx->ret2[ctx->k] = b64[ctx->h3 % 64];      if (ctx->k < SPAMSUM_LENGTH/2-1) {	ctx->h3 = HASH_INIT;	(ctx->k)++;      }    }  }}static int ss_update(ss_context *ctx, FILE *handle){  uint32_t bytes_read;  unsigned char *buffer;   buffer = (unsigned char *)malloc(sizeof(unsigned char) * BUFFER_SIZE);  if (buffer == NULL)    return TRUE;  snprintf(ctx->ret, 12, "%u:", ctx->block_size);  ctx->p = ctx->ret + strlen(ctx->ret);    memset(ctx->p, 0, SPAMSUM_LENGTH+1);  memset(ctx->ret2, 0, sizeof(ctx->ret2));    ctx->k  = ctx->j  = 0;  ctx->h3 = ctx->h2 = HASH_INIT;  ctx->h  = roll_reset();  while ((bytes_read = fread(buffer,sizeof(unsigned char),BUFFER_SIZE,handle)) > 0)  {    ss_engine(ctx,buffer,bytes_read);  }  if (ctx->h != 0)   {    ctx->p[ctx->j] = b64[ctx->h2 % 64];    ctx->ret2[ctx->k] = b64[ctx->h3 % 64];  }    strcat(ctx->p+ctx->j, ":");  strcat(ctx->p+ctx->j, ctx->ret2);  free(buffer);  return FALSE;}static int ss_compute(FILE *handle, char *result){  int done = FALSE;  ss_context *ctx = (ss_context *)malloc(sizeof(ss_context));    if (ctx == NULL)    return TRUE;  ss_init(ctx, handle);  while (!done)  {    rewind(handle);    ss_update(ctx,handle);    /* our blocksize guess may have been way off - repeat if necessary */    if (ctx->block_size > MIN_BLOCKSIZE && ctx->j < SPAMSUM_LENGTH/2)       ctx->block_size = ctx->block_size / 2;    else      done = TRUE;  }  strncpy(result,ctx->ret,MAX_RESULT);  ss_destroy(ctx);  free(ctx);  return FALSE;}/*    we only accept a match if we have at least one common substring in   the signature of length ROLLING_WINDOW. This dramatically drops the   false positive rate for low score thresholds while having   negligable affect on the rate of spam detection.   return 1 if the two strings do have a common substring, 0 otherwise*/static int has_common_substring(const char *s1, const char *s2){  int i, j;  int num_hashes;  uint32_t hashes[SPAMSUM_LENGTH];    /* there are many possible algorithms for common substring     detection. In this case I am re-using the rolling hash code     to act as a filter for possible substring matches */    roll_reset();  memset(hashes, 0, sizeof(hashes));    /* first compute the windowed rolling hash at each offset in     the first string */  for (i=0;s1[i];i++)   {    hashes[i] = roll_hash((uchar)s1[i]);  }  num_hashes = i;    roll_reset();
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -