📄 fstrcmp.c

📁 此程序是关于字符串模糊相似度的计算程序（C语言）
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
		      int k;		      for (k = 1; xv[x - k] == yv[y - k]; k++)			{			  if (k == SNAKE_LIMIT)			    {			      best = v;			      part->xmid = x;			      part->ymid = y;			      break;			    }			}		    }		}	    }	  if (best > 0)	    {	      part->lo_minimal = 1;	      part->hi_minimal = 0;	      return 2 * c - 1;	    }	  best = 0;	  for (d = bmax; d >= bmin; d -= 2)	    {	      int dd;	      int x;	      int y;	      int v;	      dd = d - bmid;	      x = bd[d];	      y = x - d;	      v = (xlim - x) * 2 + dd;	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))		{		  if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&		      yoff < y && y <= ylim - SNAKE_LIMIT)		    {		      /* We have a good enough best diagonal; now insist			 that it end with a significant snake.  */		      int k;		      for (k = 0; xv[x + k] == yv[y + k]; k++)			{			  if (k == SNAKE_LIMIT - 1)			    {			      best = v;			      part->xmid = x;			      part->ymid = y;			      break;			    }			}		    }		}	    }	  if (best > 0)	    {	      part->lo_minimal = 0;	      part->hi_minimal = 1;	      return 2 * c - 1;	    }	}#endif /* MINUS_H_FLAG */      /* Heuristic: if we've gone well beyond the call of duty, give up	 and report halfway between our best results so far.  */      if (c >= too_expensive)	{	  int fxybest;	  int fxbest;	  int bxybest;	  int bxbest;	  /* Pacify `gcc -Wall'. */	  fxbest = 0;	  bxbest = 0;	  /* Find forward diagonal that maximizes X + Y.  */	  fxybest = -1;	  for (d = fmax; d >= fmin; d -= 2)	    {	      int x;	      int y;	      x = fd[d] < xlim ? fd[d] : xlim;	      y = x - d;	      if (ylim < y)		{		  x = ylim + d;		  y = ylim;		}	      if (fxybest < x + y)		{		  fxybest = x + y;		  fxbest = x;		}	    }	  /* Find backward diagonal that minimizes X + Y.  */	  bxybest = INT_MAX;	  for (d = bmax; d >= bmin; d -= 2)	    {	      int x;	      int y;	      x = xoff > bd[d] ? xoff : bd[d];	      y = x - d;	      if (y < yoff)		{		  x = yoff + d;		  y = yoff;		}	      if (x + y < bxybest)		{		  bxybest = x + y;		  bxbest = x;		}	    }	  /* Use the better of the two diagonals.  */	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))	    {	      part->xmid = fxbest;	      part->ymid = fxybest - fxbest;	      part->lo_minimal = 1;	      part->hi_minimal = 0;	    }	  else	    {	      part->xmid = bxbest;	      part->ymid = bxybest - bxbest;	      part->lo_minimal = 0;	      part->hi_minimal = 1;	    }	  return 2 * c - 1;	}    }}/* NAME	compareseq - find edit sequence   SYNOPSIS	void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);   DESCRIPTION	Compare in detail contiguous subsequences of the two strings	which are known, as a whole, to match each other.	The subsequence of string 0 is [XOFF, XLIM) and likewise for	string 1.	Note that XLIM, YLIM are exclusive bounds.  All character	numbers are origin-0.	If MINIMAL is nonzero, find a minimal difference no matter how	expensive it is.  */static void compareseq PARAMS ((int, int, int, int, int));static voidcompareseq (xoff, xlim, yoff, ylim, minimal)     int xoff;     int xlim;     int yoff;     int ylim;     int minimal;{  const char *const xv = string[0].data;	/* Help the compiler.  */  const char *const yv = string[1].data;  if (string[1].edit_count + string[0].edit_count > max_edits)    return;  /* Slide down the bottom initial diagonal. */  while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])    {      ++xoff;      ++yoff;    }  /* Slide up the top initial diagonal. */  while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])    {      --xlim;      --ylim;    }  /* Handle simple cases. */  if (xoff == xlim)    {      while (yoff < ylim)	{	  ++string[1].edit_count;	  ++yoff;	}    }  else if (yoff == ylim)    {      while (xoff < xlim)	{	  ++string[0].edit_count;	  ++xoff;	}    }  else    {      int c;      struct partition part;      /* Find a point of correspondence in the middle of the strings.  */      c = diag (xoff, xlim, yoff, ylim, minimal, &part);      if (c == 1)	{#if 0	  /* This should be impossible, because it implies that one of	     the two subsequences is empty, and that case was handled	     above without calling `diag'.  Let's verify that this is	     true.  */	  abort ();#else	  /* The two subsequences differ by a single insert or delete;	     record it and we are done.  */	  if (part.xmid - part.ymid < xoff - yoff)	    ++string[1].edit_count;	  else	    ++string[0].edit_count;#endif	}      else	{	  /* Use the partitions to split this problem into subproblems.  */	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);	}    }}/* NAME	fstrcmp - fuzzy string compare   SYNOPSIS	double fstrcmp(const char *, const char *, double);   DESCRIPTION	The fstrcmp function may be used to compare two string for	similarity.  It is very useful in reducing "cascade" or	"secondary" errors in compilers or other situations where	symbol tables occur.   RETURNS	double; 0 if the strings are entirly dissimilar, 1 if the	strings are identical, and a number in between if they are	similar.  */doublefstrcmp (const char *string1, const char *string2, double minimum){  int i;  size_t fdiag_len;  static int *fdiag_buf;  static size_t fdiag_max;  /* set the info for each string.  */  string[0].data = string1;  string[0].data_length = strlen (string1);  string[1].data = string2;  string[1].data_length = strlen (string2);  /* short-circuit obvious comparisons */  if (string[0].data_length == 0 && string[1].data_length == 0)    return 1.0;  if (string[0].data_length == 0 || string[1].data_length == 0)    return 0.0;  /* Set TOO_EXPENSIVE to be approximate square root of input size,     bounded below by 256.  */  too_expensive = 1;  for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)    too_expensive <<= 1;  if (too_expensive < 256)    too_expensive = 256;  /* Because fstrcmp is typically called multiple times, while scanning     symbol tables, etc, attempt to minimize the number of memory     allocations performed.  Thus, we use a static buffer for the     diagonal vectors, and never free them.  */  fdiag_len = string[0].data_length + string[1].data_length + 3;  if (fdiag_len > fdiag_max)    {      fdiag_max = fdiag_len;      fdiag_buf = realloc (fdiag_buf, fdiag_max * (2 * sizeof (int)));    }  fdiag = fdiag_buf + string[1].data_length + 1;  bdiag = fdiag + fdiag_len;  max_edits = 1 + (string[0].data_length + string[1].data_length) * (1. - minimum);  /* Now do the main comparison algorithm */  string[0].edit_count = 0;  string[1].edit_count = 0;  compareseq (0, string[0].data_length, 0, string[1].data_length, 0);  /* The result is	((number of chars in common) / (average length of the strings)).     This is admittedly biased towards finding that the strings are     similar, however it does produce meaningful results.  */  return ((double)             (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)           / (string[0].data_length + string[1].data_length));}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -