📄 diff1.c
字号:
/* diff - differential file comparison** Uses an algorithm due to Harold Stone, which finds* a pair of longest identical subsequences in the two* files.** The major goal is to generate the match vector J.* J[i] is the index of the line in file1 corresponding* to line i file0. J[i] = 0 if there is no* such line in file1.** Lines are hashed so as to work in core. All potential* matches are located by sorting the lines of each file* on the hash (called value_____). In particular, this* collects the equivalence classes in file1 together.* Subroutine equiv____ replaces the value of each line in* file0 by the index of the first element of its * matching equivalence in (the reordered) file1.* To save space equiv_____ squeezes file1 into a single* array member______ in which the equivalence classes* are simply concatenated, except that their first* members are flagged by changing sign.** Next the indices that point into member______ are unsorted_______ into* array class_____ according to the original order of file0.** The cleverness lies in routine stone______. This marches* through the lines of file0, developing a vector klist_____* of "k-candidates". At step i a k-candidate is a matched* pair of lines x,y (x in file0 y in file1) such that* there is a common subsequence of lenght k* between the first i lines of file0 and the first y * lines of file1, but there is no such subsequence for* any smaller y. x is the earliest possible mate to y* that occurs in such a subsequence.** Whenever any of the members of the equivalence class of* lines in file1 matable to a line in file0 has serial number * less than the y of some k-candidate, that k-candidate * with the smallest such y is replaced. The new * k-candidate is chained (via pred____) to the current* k-1 candidate so that the actual subsequence can* be recovered. When a member has serial number greater* that the y of all k-candidates, the klist is extended.* At the end, the longest subsequence is pulled out* and placed in the array J by unravel_______.** With J in hand, the matches there recorded are* check_____ed against reality to assure that no spurious* matches have crept in due to hashing. If they have,* they are broken, and "jackpot " is recorded--a harmless* matter except that a true match for a spuriously* mated line may now be unnecessarily reported as a change.** Much of the complexity of the program comes simply* from trying to minimize core utilization and* maximize the range of doable problems by dynamically* allocating what is needed and reusing what is not.* The core requirements for problems larger than somewhat* are (in words) 2*length(file0) + length(file1) +* 3*(number of k-candidates installed), typically about* 6n words for files of length n. There is also space for buf1* used which could, by moving data underfoot and reallocating* buf1 together with buf2, be completely overlaid.*/struct buf { int fdes; char data[516];} *buf1, *buf2;struct cand { int x; int y; struct cand *pred;} cand;struct line { int serial; int value;} *file[2], line;int len[2];int *class; /*will be overlaid on file[0]*/int *member; /*will be overlaid on file[1]*/struct cand **klist; /*will be overlaid on file[0] after class*/int *J; /*will be overlaid on class*/int *ixold; /*will be overlaid on klist*/int *ixnew; /*will be overlaid on file[1]*/char *area;char *top;alloc(n){ register char *p; p = area; n = (n+1) & ~1; area =+ n; while(area > top) { if(sbrk(1024) == -1) { mesg("Out of space\n"); exit(1); } top =+ 1024; } return(p);}mesg(s)char *s;{ while(*s) write(2,s++,1);}sort(a,n) /*shellsort CACM #201*/struct line *a;{ struct line w; register int j,m; struct line *ai; register struct line *aim; int k; for(j=1;j<=n;j=* 2) m = 2*j - 1; for(m=/2;m!=0;m=/2) { k = n-m; for(j=1;j<=k;j++) { for(ai = &a[j]; ai > a; ai =- m) { aim = &ai[m]; if(aim->value > ai[0].value || aim->value == ai[0].value && aim->serial > ai[0].serial) break; w.value = ai[0].value; ai[0].value = aim->value; aim->value = w.value; w.serial = ai[0].serial; ai[0].serial = aim->serial; aim->serial = w.serial; } } }}unsort(f, l, b)struct line *f;int *b;{ int *a; int i; a = alloc((l+1)*sizeof(a[0])); for(i=1;i<=l;i++) a[f[i].serial] = f[i].value; for(i=1;i<=l;i++) b[i] = a[i]; area = a;}prepare(i, arg)char *arg;{ register char *temp; temp = file[i] = area; alloc(sizeof(line)); input(arg); len[i] = (area - temp)/sizeof(line) - 1; alloc(sizeof(line)); sort(file[i], len[i]);}input(arg){ register int h, i; register struct line *p; if(fopen(arg,buf1) == -1) { mesg("Cannot open "); mesg(arg); mesg("\n"); exit(1); } for(i=0; h=readhash(buf1);) { p = alloc(sizeof(line)); p->serial = ++i; p->value = h; } close(buf1->fdes);}equiv(a,n,b,m,c)struct line *a, *b;int *c;{ register int i, j; i = j = 1; while(i<=n && j<=m) { if(a[i].value <b[j].value) a[i++].value = 0; else if(a[i].value == b[j].value) a[i++].value = j; else j++; } while(i <= n) a[i++].value = 0; b[m+1].value = 0; j = 0; while(++j <= m) { c[j] = -b[j].serial; while(b[j+1].value == b[j].value) { j++; c[j] = b[j].serial; } } c[j] = -1;}main(argc, argv)char **argv;{ int k; if(argc>1 && *argv[1]=='-') { argc--; argv++; } if(argc!=3) { mesg("Arg count\n"); exit(1); } area = top = sbrk(0); buf1 = alloc(sizeof(*buf1)); prepare(0, argv[1]); prepare(1, argv[2]); member = file[1]; equiv(file[0], len[0], file[1], len[1], member); class = file[0]; unsort(file[0], len[0], class); klist = &class[len[0]+2]; area = &member[len[1]+2]; k = stone(class, len[0], member, klist); J = class; unravel(klist[k]); ixold = klist; ixnew = file[1]; area = &ixnew[len[1]+2]; buf2 = alloc(sizeof(*buf2)); if(check(argv)) mesg("Jackpot\n"); output(argv);}stone(a,n,b,c)int *a;int *b;struct cand **c;{ register int i, k,y; int j, l; int skip; k = 0; c[0] = 0; for(i=1; i<=n; i++) { j = a[i]; if(j==0) continue; skip = 0; do { y = b[j]; if(y<0) y = -y; if(skip) continue; l = search(c, k, y); if(l > k) { c[k+1] = newcand(i,y,c[k]); skip = 1; k++; } else if(c[l]->y > y && c[l]->x < i) c[l] = newcand(i,y,c[l-1]); } while(b[++j] > 0); } return(k);}struct cand *newcand(x,y,pred)struct cand *pred;{ struct cand *p; p = alloc(sizeof(cand)); p->x = x; p->y = y; p->pred = pred; return(p);}search(c, k, y)struct cand **c;{ register int i, j, l; int t; i = 0; j = k+1; while((l=(i+j)/2) > i) { t = c[l]->y; if(t > y) j = l; else if(t < y) i = l; else return(l); } return(l+1);}unravel(p)struct cand *p;{ int i; for(i=0; i<=len[0]; i++) J[i] = 0; while(p) { J[p->x] = p->y; p = p->pred; }}/* check does double duty:1. ferret out any fortuitous correspondences dueto counfounding by hashing (which result in "jackpot")2. collect random access indexes to the two files */check(argv)char **argv;{ register int i, j; int ctold, ctnew; int jackpot; char c,d; fopen(argv[1],buf1); fopen(argv[2],buf2); j = 1; ctold = ctnew = 0; ixold[0] = ixnew[0] = 0; jackpot = 0; for(i=1;i<=len[0];i++) { if(J[i]==0) { while(getc(buf1)!='\n') ctold++; ixold[i] = ++ctold; continue; } while(j<J[i]) { while(getc(buf2)!='\n') ctnew++; ixnew[j] = ++ctnew; j++; } while((c=getc(buf1))==(d=getc(buf2))) { if(c=='\n') break; ctold++; ctnew++; } while(c!='\n') { jackpot++; J[i] = 0; c = getc(buf1); ctold++; } ixold[i] = ++ctold; while(d!='\n') { jackpot++; J[i] = 0; d = getc(buf2); ctnew++; } ixnew[j] = ++ctnew; j++; } for(;j<=len[1];j++) { while(getc(buf2)!='\n') ctnew++; ixnew[j] = ++ctnew; } close(buf1->fdes); close(buf2->fdes); return(jackpot);}output(argv)char **argv;{ int dir; int m; int i0,i1,j0,j1; extern fout; dir = **argv=='-'; fout = dup(1); buf1->fdes = open(argv[1],0); buf2->fdes = open(argv[2],0); m = len[0]; J[0] = 0; J[m+1] = len[1]+1; if(dir==0) for(i0=1;i0<=m;i0=i1+1) { while(i0<=m&&J[i0]==J[i0-1]+1) i0++; j0 = J[i0-1]+1; i1 = i0-1; while(i1<m&&J[i1+1]==0) i1++; j1 = J[i1+1]-1; J[i1] = j1; change(i0,i1,j0,j1,dir); } else for(i0=m;i0>=1;i0=i1-1) { while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--; j0 = J[i0+1]-1; i1 = i0+1; while(i1>1&&J[i1-1]==0) i1--; j1 = J[i1-1]+1; J[i1] = j1; change(i1,i0,j1,j0,dir); } if(m==0) change(1,0,1,len[1],dir); flush();}change(a,b,c,d,dir){ if(a>b&&c>d) return; range(a,b); putchar(a>b?'a':c>d?'d':'c'); if(dir==0) range(c,d); putchar('\n'); if(dir==0) { fetch(ixold,a,b,buf1,"* "); if(a<=b&&c<=d) printf("---\n"); } fetch(ixnew,c,d,buf2,dir==0?". ":""); if(dir!=0&&c<=d) printf(".\n");}range(a,b){ if(a>b) printf("%d",b); if(a<=b) printf("%d",a); if(a<b) printf(",%d",b);}fetch(f,a,b,lb,pref)int *f;struct buf *lb;char *pref;{ int i, j; int nc; for(i=a;i<=b;i++) { seek(lb->fdes,f[i-1],0); nc = read(lb->fdes,lb->data,f[i]-f[i-1]); printf(pref); for(j=0;j<nc;j++) putchar(lb->data[j]); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -