📄 engine.c
字号:
/* now for each offset in the second string compute the rolling hash and compare it to all of the rolling hashes for the first string. If one matches then we have a candidate substring match. We then confirm that match with a direct string comparison */ for (i=0;s2[i];i++) { uint32_t h = roll_hash((uchar)s2[i]); if (i < ROLLING_WINDOW-1) continue; for (j=ROLLING_WINDOW-1;j<num_hashes;j++) {// printf("hashes[%d]=%d and %d ",j,hashes[j],h);////////////9.24 if (hashes[j] != 0 && hashes[j] == h) {// printf("test 21 \n");////////9.21号 /* we have a potential match - confirm it */ if (strlen(s2+i-(ROLLING_WINDOW-1)) >= ROLLING_WINDOW && strncmp(s2+i-(ROLLING_WINDOW-1), s1+j-(ROLLING_WINDOW-1), ROLLING_WINDOW) == 0) { return 1; } } } } return 0;}/* eliminate sequences of longer than 3 identical characters. These sequences contain very little information so they tend to just bias the result unfairly*/static char *eliminate_sequences(const char *str){ char *ret; int i, j, len; ret = strdup(str); if (!ret) return NULL; len = strlen(str); for (i=j=3;i<len;i++) { if (str[i] != str[i-1] || str[i] != str[i-2] || str[i] != str[i-3]) { ret[j++] = str[i]; } } ret[j] = 0; return ret;}/* this is the low level string scoring algorithm. It takes two strings and scores them on a scale of 0-100 where 0 is a terrible match and 100 is a great match. The block_size is used to cope with very small messages.*/static unsigned score_strings(const char *s1, const char *s2, uint32_t block_size){ uint32_t score; uint32_t len1, len2; int edit_distn(const char *from, int from_len, const char *to, int to_len); len1 = strlen(s1); len2 = strlen(s2); if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) { /* not a real spamsum signature? */ return 0; } /* the two strings must have a common substring of length ROLLING_WINDOW to be candidates */// char string1[]="m2AuCDzuaEWrZeDBNNQtWjcZud5jyzpWeop40cdf";// char string2[]="m2AuCOzuaEWrZeDBKNQtWjjZud5jyepWeop4dcdf"; if (has_common_substring(s1, s2) == 0) { printf("\nthe string1 and string2 has no comman substring!\n"); return 0; } /* compute the edit distance between the two strings. The edit distance gives us a pretty good idea of how closely related the two strings are */ score = edit_distn(s1, len1, s2, len2); printf("s1 and s2 edit_distance is %d\n",score); /* scale the edit distance by the lengths of the two strings. This changes the score to be a measure of the proportion of the message that has changed rather than an absolute quantity. It also copes with the variability of the string lengths. */ score = (score * SPAMSUM_LENGTH) / (len1 + len2); /* at this stage the score occurs roughly on a 0-64 scale, * with 0 being a good match and 64 being a complete * mismatch */ /* rescale to a 0-100 scale (friendlier to humans) */ score = (100 * score) / 64; /* it is possible to get a score above 100 here, but it is a really terrible match */ if (score >= 100) return 0; /* now re-scale on a 0-100 scale with 0 being a poor match and 100 being a excellent match. */ score = 100 - score; // printf ("len1: %"PRIu32" len2: %"PRIu32"\n", len1, len2); /* when the blocksize is small we don't want to exaggerate the match size */ if (score > block_size/MIN_BLOCKSIZE * MIN(len1, len2)) { score = block_size/MIN_BLOCKSIZE * MIN(len1, len2); } return score;}/* given two spamsum strings return a value indicating the degree to which they match.*/uint32_t spamsum_match(state *s, const char *str1, const char *str2){ uint32_t block_size1, block_size2; uint32_t score = 0; char *s1, *s2; char *s1_1, *s1_2; char *s2_1, *s2_2; /* each spamsum is prefixed by its block size */ if (sscanf(str1, "%u:", &block_size1) != 1 || sscanf(str2, "%u:", &block_size2) != 1) { return 0; } /* if the blocksizes don't match then we are comparing apples to oranges ... */ if (block_size1 != block_size2 && block_size1 != block_size2*2 && block_size2 != block_size1*2) { return 0; } /* move past the prefix */ str1 = strchr(str1, ':'); str2 = strchr(str2, ':'); if (!str1 || !str2) { /* badly formed ... */ return 0; } /* there is very little information content is sequences of the same character like 'LLLLL'. Eliminate any sequences longer than 3. This is especially important when combined with the has_common_substring() test below. */ s1 = eliminate_sequences(str1+1); s2 = eliminate_sequences(str2+1); if (!s1 || !s2) return 0; /* now break them into the two pieces */ s1_1 = s1; s2_1 = s2; s1_2 = strchr(s1, ':'); s2_2 = strchr(s2, ':'); if (!s1_2 || !s2_2) { /* a signature is malformed - it doesn't have 2 parts */ free(s1); free(s2); return 0; } *s1_2++ = 0; *s2_2++ = 0; /* each signature has a string for two block sizes. We now choose how to combine the two block sizes. We checked above that they have at least one block size in common */ if (block_size1 == block_size2) { uint32_t score1, score2; score1 = score_strings(s1_1, s2_1, block_size1); score2 = score_strings(s1_2, s2_2, block_size2); s->block_size = block_size1; score = MAX(score1, score2); } else if (block_size1 == block_size2*2) { score = score_strings(s1_1, s2_2, block_size1); s->block_size = block_size1; } else { score = score_strings(s1_2, s2_1, block_size2); s->block_size = block_size2; } free(s1); free(s2); return score;}int hash_file(state *s, char *fn){ size_t fn_length; char *sum, *msg, *my_filename; FILE *handle; if ((handle = fopen(fn,"rb")) == NULL) { print_error(s,fn,strerror(errno)); return TRUE; } if ((sum = (char *)malloc(sizeof(char) * MAX_RESULT)) == NULL) { fclose(handle); print_error(s,fn,"out of memory"); return TRUE; } if ((msg = (char *)malloc(sizeof(char) * 80)) == NULL) { free(sum); fclose(handle); print_error(s,fn,"out of memory"); return TRUE; }#define CUTOFF_LENGTH 78 if (MODE(mode_verbose)) { fn_length = strlen(fn); if (fn_length > CUTOFF_LENGTH) { // We have to make a duplicate of the string to call basename on it // We need the original name for the output later on my_filename = strdup(fn); my_basename(my_filename); } else my_filename = fn; /* So that the message is zero-terminated, we set the last char to zeros and make sure that we don't write to the last character */ msg[CUTOFF_LENGTH-1] = 0; snprintf(msg,CUTOFF_LENGTH-1,"Hashing: %s", my_filename); fprintf(stderr,"%s\r", msg); if (fn_length > CUTOFF_LENGTH) free(my_filename); } ss_compute(handle,sum); prepare_filename(s,fn); if (MODE(mode_match_pretty)) { if (match_add(s,fn,sum)) print_error(s,fn,"Unable to add hash to set of known hashes"); } else if (MODE(mode_match) || MODE(mode_directory)) { match_compare(s,fn,sum); if (MODE(mode_directory)) if (match_add(s,fn,sum)) print_error(s,fn,"Unable to add hash to set of known hashes"); } else { if (s->first_file_processed) { printf ("%s%s", OUTPUT_FILE_HEADER,NEWLINE); s->first_file_processed = FALSE; } printf ("%s,\"%s\"%s", sum, fn, NEWLINE); } fclose(handle); free(sum); free(msg); return FALSE;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -