⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 edit_dist.c

📁 检测文件的相似性的一个小程序!使用Linux系统编写
💻 C
字号:
/*  This edit distance code is taken from trn3.6. A few minor  modifications have been made by Andrew Tridgell <tridge@samba.org>  for use in spamsum.*//***************************************************************************//* The authors make no claims as to the fitness or correctness of this software * for any use whatsoever, and it is provided as is. Any use of this software * is at the user's own risk.  */#include <stdio.h>#include <unistd.h>#include <stdlib.h>/* edit_dist -- returns the minimum edit distance between two strings	Program by:  Mark Maimone   CMU Computer Science   13 Nov 89	Last Modified:  28 Jan 90   If the input strings have length n and m, the algorithm runs in time   O(nm) and space O(min(m,n)).HISTORY   13 Nov 89 (mwm) Created edit_dist() and set_costs().   28 Jan 90 (mwm) Added view_costs().  Should verify that THRESHOLD   computations will work even when THRESHOLD is not a multiple of   sizeof(int).   17 May 93 (mwm) Improved performance when used with trn's newsgroup   processing; assume all costs are 1, and you can terminate when a   threshold is exceeded.*/#define MIN_DIST 100#define	TRN_SPEEDUP		/* Use a less-general version of the				   routine, one that's better for trn.				   All change costs are 1, and it's okay				   to terminate if the edit distance is				   known to exceed MIN_DIST */#define THRESHOLD 4000		/* worry about allocating more memory only				   when this # of bytes is exceeded */#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))#define swap_int(x,y)  (_iswap = (x), (x) = (y), (y) = _iswap)#define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)#define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))#define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))static int insert_cost = 1;static int delete_cost = 1;#ifndef TRN_SPEEDUPstatic int change_cost = 1;static int swap_cost   = 1;#endifstatic int _iswap;			/* swap_int temp variable */static char *_cswap;			/* swap_char temp variable */static int _mx, _my, _mz;		/* min2, min3 temp variables *//* edit_distn -- returns the edit distance between two strings, or -1 on   failure */intedit_distn(from, from_len, to, to_len)char *from, *to;register int from_len, to_len;{#ifndef TRN_SPEEDUP    register int ins, del, ch;	  	/* local copies of edit costs */#endif    register int row, col, index;	/* dynamic programming counters */    register int radix;			/* radix for modular indexing */#ifdef TRN_SPEEDUP    register int low;#endif    int *buffer;			/* pointer to storage for one row					   of the d.p. array */    static int store[THRESHOLD / sizeof (int)];					/* a small amount of static					   storage, to be used when the					   input strings are small enough *//* Handle trivial cases when one string is empty */    if (from == NULL || !from_len)	if (to == NULL || !to_len)	    return 0;	else	    return to_len * insert_cost;    else if (to == NULL || !to_len)	return from_len * delete_cost;/* Initialize registers */    radix = 2 * from_len + 3;#ifdef TRN_SPEEDUP#define ins 1#define del 1#define ch 3#define swap_cost 5#else    ins  = insert_cost;    del  = delete_cost;    ch   = change_cost;#endif/* Make   from   short enough to fit in the static storage, if it's at all   possible */    if (from_len > to_len && from_len > STRLENTHRESHOLD) {	swap_int(from_len, to_len);	swap_char(from, to);#ifndef TRN_SPEEDUP	swap_int(ins, del);#endif    } /* if from_len > to_len *//* Allocate the array storage (from the heap if necessary) */    if (from_len <= STRLENTHRESHOLD)	buffer = store;    else	buffer = (int *) malloc(radix * sizeof (int));/* Here's where the fun begins.  We will find the minimum edit distance   using dynamic programming.  We only need to store two rows of the matrix   at a time, since we always progress down the matrix.  For example,   given the strings "one" and "two", and insert, delete and change costs   equal to 1:	   _  o  n  e	_  0  1  2  3	t  1  1  2  3	w  2  2  2  3	o  3  2  3  3   The dynamic programming recursion is defined as follows:	ar(x,0) := x * insert_cost	ar(0,y) := y * delete_cost	ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),		       a(x - 1, y) + insert_cost,		       a(x, y - 1) + delete_cost,		       a(x - 2, y - 2) + (from[x] == to[y-1] &&					  from[x-1] == to[y] ? swap_cost :					  infinity))   Since this only looks at most two rows and three columns back, we need   only store the values for the two preceeding rows.  In this   implementation, we do not explicitly store the zero column, so only 2 *   from_len + 2   words are needed.  However, in the implementation of the   swap_cost   check, the current matrix value is used as a buffer; we   can't overwrite the earlier value until the   swap_cost   check has   been performed.  So we use   2 * from_len + 3   elements in the buffer.*/#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \	buffer[mod(index)]))#define NW(x,y)	  ar(x, y, index + from_len + 2)#define N(x,y)	  ar(x, y, index + from_len + 3)#define W(x,y)	  ar(x, y, index + radix - 1)#define NNWW(x,y) ar(x, y, index + 1)#define mod(x) ((x) % radix)    index = 0;#ifdef DEBUG_EDITDIST    printf("      ");    for (col = 0; col < from_len; col++)	printf(" %c ", from[col]);    printf("\n   ");    for (col = 0; col <= from_len; col++)	printf("%2d ", col * del);#endif/* Row 0 is handled implicitly; its value at a given column is   col*del.   The loop below computes the values for Row 1.  At this point we know the   strings are nonempty.  We also don't need to consider swap costs in row   1.   COMMENT:  the indicies   row and col   below point into the STRING, so   the corresponding MATRIX indicies are   row+1 and col+1.*/    buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));#ifdef TRN_SPEEDUP    low = buffer[mod(index + radix - 1)];#endif#ifdef DEBUG_EDITDIST    printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);#endif    for (col = 1; col < from_len; col++) {	buffer[index] = min3(		col * del + ((from[col] == to[0]) ? 0 : ch),		(col + 1) * del + ins,		buffer[index - 1] + del);#ifdef TRN_SPEEDUP	if (buffer[index] < low)	    low = buffer[index];#endif	index++;#ifdef DEBUG_EDITDIST	printf("%2d ", buffer[index - 1]);#endif    } /* for col = 1 */#ifdef DEBUG_EDITDIST    printf("\n %c %2d ", to[1], 2 * ins);#endif/* Now handle the rest of the matrix */    for (row = 1; row < to_len; row++) {	for (col = 0; col < from_len; col++) {	    buffer[index] = min3(		    NW(row, col) + ((from[col] == to[row]) ? 0 : ch),		    N(row, col + 1) + ins,		    W(row + 1, col) + del);	    if (from[col] == to[row - 1] && col > 0 &&		    from[col - 1] == to[row])		    		buffer[index] = min2(buffer[index],			NNWW(row - 1, col - 1) + swap_cost);#ifdef DEBUG_EDITDIST	    printf("%2d ", buffer[index]);#endif#ifdef TRN_SPEEDUP	    if (buffer[index] < low || col == 0)		low = buffer[index];#endif	    index = mod(index + 1);	} /* for col = 1 */#ifdef DEBUG_EDITDIST	if (row < to_len - 1)	    printf("\n %c %2d ", to[row+1], (row + 2) * ins);	else	    printf("\n");#endif#ifdef TRN_SPEEDUP	if (low > MIN_DIST)	    break;#endif    } /* for row = 1 */    row = buffer[mod(index + radix - 1)];    if (buffer != store)	free((char *) buffer);    return row;} /* edit_distn */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -