📄 engine.c

📁 检测文件的相似性的一个小程序！使用Linux系统编写
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
    /* now for each offset in the second string compute the     rolling hash and compare it to all of the rolling hashes     for the first string. If one matches then we have a     candidate substring match. We then confirm that match with     a direct string comparison */  for (i=0;s2[i];i++) {    uint32_t h = roll_hash((uchar)s2[i]);    if (i < ROLLING_WINDOW-1) continue;    for (j=ROLLING_WINDOW-1;j<num_hashes;j++)     {//	printf("hashes[%d]=%d and %d ",j,hashes[j],h);////////////9.24      if (hashes[j] != 0 && hashes[j] == h)       {//	printf("test 21  \n");////////9.21号	/* we have a potential match - confirm it */	if (strlen(s2+i-(ROLLING_WINDOW-1)) >= ROLLING_WINDOW && 	    strncmp(s2+i-(ROLLING_WINDOW-1), 		    s1+j-(ROLLING_WINDOW-1), 		    ROLLING_WINDOW) == 0) 	{	  return 1;	}      }    }  }    return 0;}/*  eliminate sequences of longer than 3 identical characters. These  sequences contain very little information so they tend to just bias  the result unfairly*/static char *eliminate_sequences(const char *str){  char *ret;  int i, j, len;    ret = strdup(str);  if (!ret) return NULL;    len = strlen(str);    for (i=j=3;i<len;i++) {    if (str[i] != str[i-1] ||	str[i] != str[i-2] ||	str[i] != str[i-3]) {      ret[j++] = str[i];    }  }    ret[j] = 0;    return ret;}/*  this is the low level string scoring algorithm. It takes two strings  and scores them on a scale of 0-100 where 0 is a terrible match and  100 is a great match. The block_size is used to cope with very small  messages.*/static unsigned score_strings(const char *s1, const char *s2, uint32_t block_size){  uint32_t score;  uint32_t len1, len2;  int edit_distn(const char *from, int from_len, const char *to, int to_len);    len1 = strlen(s1);  len2 = strlen(s2);  if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {    /* not a real spamsum signature? */    return 0;  }    /* the two strings must have a common substring of length     ROLLING_WINDOW to be candidates *///	char string1[]="m2AuCDzuaEWrZeDBNNQtWjcZud5jyzpWeop40cdf";//	char string2[]="m2AuCOzuaEWrZeDBKNQtWjjZud5jyepWeop4dcdf";  if (has_common_substring(s1, s2) == 0) {	printf("\nthe string1 and string2 has no comman substring!\n");    return 0;  }	    /* compute the edit distance between the two strings. The edit distance gives     us a pretty good idea of how closely related the two strings are */  score = edit_distn(s1, len1, s2, len2);   printf("s1 and s2 edit_distance is %d\n",score);  /* scale the edit distance by the lengths of the two     strings. This changes the score to be a measure of the     proportion of the message that has changed rather than an     absolute quantity. It also copes with the variability of     the string lengths. */  score = (score * SPAMSUM_LENGTH) / (len1 + len2);    /* at this stage the score occurs roughly on a 0-64 scale,   * with 0 being a good match and 64 being a complete   * mismatch */    /* rescale to a 0-100 scale (friendlier to humans) */  score = (100 * score) / 64;    /* it is possible to get a score above 100 here, but it is a     really terrible match */  if (score >= 100) return 0;    /* now re-scale on a 0-100 scale with 0 being a poor match and     100 being a excellent match. */  score = 100 - score;  //  printf ("len1: %"PRIu32"  len2: %"PRIu32"\n", len1, len2);    /* when the blocksize is small we don't want to exaggerate the match size */  if (score > block_size/MIN_BLOCKSIZE * MIN(len1, len2)) {    score = block_size/MIN_BLOCKSIZE * MIN(len1, len2);  }  return score;}/*  given two spamsum strings return a value indicating the degree to which they match.*/uint32_t spamsum_match(state *s, const char *str1, const char *str2){  uint32_t block_size1, block_size2;  uint32_t score = 0;  char *s1, *s2;  char *s1_1, *s1_2;  char *s2_1, *s2_2;    /* each spamsum is prefixed by its block size */  if (sscanf(str1, "%u:", &block_size1) != 1 ||      sscanf(str2, "%u:", &block_size2) != 1) {    return 0;  }    /* if the blocksizes don't match then we are comparing     apples to oranges ... */  if (block_size1 != block_size2 &&       block_size1 != block_size2*2 &&      block_size2 != block_size1*2) {    return 0;  }    /* move past the prefix */  str1 = strchr(str1, ':');  str2 = strchr(str2, ':');    if (!str1 || !str2) {    /* badly formed ... */    return 0;  }    /* there is very little information content is sequences of     the same character like 'LLLLL'. Eliminate any sequences     longer than 3. This is especially important when combined     with the has_common_substring() test below. */  s1 = eliminate_sequences(str1+1);  s2 = eliminate_sequences(str2+1);    if (!s1 || !s2) return 0;    /* now break them into the two pieces */  s1_1 = s1;  s2_1 = s2;    s1_2 = strchr(s1, ':');  s2_2 = strchr(s2, ':');    if (!s1_2 || !s2_2) {    /* a signature is malformed - it doesn't have 2 parts */    free(s1); free(s2);    return 0;  }  *s1_2++ = 0;  *s2_2++ = 0;    /* each signature has a string for two block sizes. We now     choose how to combine the two block sizes. We checked above     that they have at least one block size in common */  if (block_size1 == block_size2) {    uint32_t score1, score2;    score1 = score_strings(s1_1, s2_1, block_size1);    score2 = score_strings(s1_2, s2_2, block_size2);    s->block_size = block_size1;    score = MAX(score1, score2);  } else if (block_size1 == block_size2*2) {    score = score_strings(s1_1, s2_2, block_size1);    s->block_size = block_size1;  } else {    score = score_strings(s1_2, s2_1, block_size2);    s->block_size = block_size2;  }    free(s1);  free(s2);    return score;}int hash_file(state *s, char *fn){  size_t fn_length;  char *sum, *msg, *my_filename;  FILE *handle;    if ((handle = fopen(fn,"rb")) == NULL)  {    print_error(s,fn,strerror(errno));    return TRUE;  }   if ((sum = (char *)malloc(sizeof(char) * MAX_RESULT)) == NULL)  {    fclose(handle);    print_error(s,fn,"out of memory");    return TRUE;  }  if ((msg = (char *)malloc(sizeof(char) * 80)) == NULL)  {    free(sum);    fclose(handle);    print_error(s,fn,"out of memory");    return TRUE;  }#define CUTOFF_LENGTH   78  if (MODE(mode_verbose))  {    fn_length = strlen(fn);    if (fn_length > CUTOFF_LENGTH)    {      // We have to make a duplicate of the string to call basename on it      // We need the original name for the output later on      my_filename = strdup(fn);      my_basename(my_filename);    }    else      my_filename = fn;    /* So that the message is zero-terminated, we set the last char       to zeros and make sure that we don't write to the last character */    msg[CUTOFF_LENGTH-1] = 0;    snprintf(msg,CUTOFF_LENGTH-1,"Hashing: %s", my_filename);    fprintf(stderr,"%s\r", msg);    if (fn_length > CUTOFF_LENGTH)      free(my_filename);  }  ss_compute(handle,sum);  prepare_filename(s,fn);  if (MODE(mode_match_pretty))  {    if (match_add(s,fn,sum))      print_error(s,fn,"Unable to add hash to set of known hashes");  }  else if (MODE(mode_match) || MODE(mode_directory))  {    match_compare(s,fn,sum);    if (MODE(mode_directory))      if (match_add(s,fn,sum))	print_error(s,fn,"Unable to add hash to set of known hashes");  }  else  {    if (s->first_file_processed)    {      printf ("%s%s", OUTPUT_FILE_HEADER,NEWLINE);      s->first_file_processed = FALSE;    }    printf ("%s,\"%s\"%s", sum, fn, NEWLINE);  }  fclose(handle);  free(sum);  free(msg);  return FALSE;}
上一页 12
💿 文件大小 66 K
👤 上传用户 yangjiuhe
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#Linux #检测 #相似性 #程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -