📄 leditdist.cpp
字号:
#include "leditdist.hpp"// The basic algorithm is as follows://// Let A[n] represent the nth character of string n// A[n..] represent the substring of A starting at n// if n > length of A then it is considered an empty string//// edit_distance(A,B,limit) = ed(A,B,0)// where ed(A,B,d) = d if A & B is empty.// = infinity if d > limit// = ed(A[2..],B[2..], d) if A[1] == B[1] // = min ( ed(A[2..],B[2..], d+1), // ed(A, B[2..], d+1),// ed(A[2..],B, d+1) ) otherwise//// However, the code below:// 1) Also allows for swaps// 2) Allow weights to be attached to each edit// 3) Is not recursive, it uses a loop when it is tail recursion// and a small stack otherwise. The stack will NEVER be larger// then 2 * limit.// 4) Is extremely optimized#define check_rest(a,b,s) \ a0 = a; b0 = b; \ while (*a0 == *b0) { \ if (*a0 == '\0') { \ if (s < min) min = s; \ break; \ } \ ++a0; ++b0; \ }namespace aspeller { int limit_edit_distance(const char * a, const char * b, int limit, const EditDistanceWeights & w) { limit = limit*w.max; static const int size = 10; struct Edit { const char * a; const char * b; int score; }; Edit begin[size]; Edit * i = begin; const char * a0; const char * b0; int score = 0; int min = LARGE_NUM; while (true) { while (*a == *b) { if (*a == '\0') { if (score < min) min = score; goto FINISH; } ++a; ++b; } if (*a == '\0') { do { score += w.del2; if (score >= min) goto FINISH; ++b; } while (*b != '\0'); min = score; } else if (*b == '\0') { do { score += w.del1; if (score >= min) goto FINISH; ++a; } while (*a != '\0'); min = score; } else { if (score + w.max <= limit) { if (limit*w.min <= w.max*(w.min+score)) { // if floor(score/max)=limit/max-1 then this edit is only good // if it makes the rest of the string match. So check if // the rest of the string matches to avoid the overhead of // pushing it on then off the stack // delete a character from a check_rest(a+1,b,score + w.del1); // delete a character from b check_rest(a,b+1,score + w.del2); if (*a == *(b+1) && *b == *(a+1)) { // swap two characters check_rest(a+2,b+2, score + w.swap); } else { // substitute one character for another which is the same // thing as deleting a character from both a & b check_rest(a+1,b+1, score + w.sub); } } else { // delete a character from a i->a = a + 1; i->b = b; i->score = score + w.del1; ++i; // delete a character from b i->a = a; i->b = b + 1; i->score = score + w.del2; ++i; // If two characters can be swapped and make a match // then the substitution is pointless. // Also, there is no need to push this on the stack as // it is going to be imminently removed. if (*a == *(b+1) && *b == *(a+1)) { // swap two characters a = a + 2; b = b + 2; score += w.swap; continue; } else { // substitute one character for another which is the same // thing as deleting a character from both a & b a = a + 1; b = b + 1; score += w.sub; continue; } } } } FINISH: if (i == begin) return min; --i; a = i->a; b = i->b; score = i->score; } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -