📄 merge.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
** Copyright (C) 1995, Enterprise Integration Technologies Corp.
** All Rights Reserved.
** Kevin Hughes, kevinh@eit.com
** 3/11/94
**
** Released under the GPL by EIT
**
** Heavily hacked for Harvest
*/

#include <stdio.h>

#include "swish.h"
#include "merge.h"
#include "error.h"
#include "file.h"
#include "search.h"
#include "string.h"
#include "index.h"
#include "hash.h"
#include "mem.h"

/* Opens the index file "file" for reading and validates its header
** with isokindexheader(). Failures are reported through progerr().
** NOTE(review): progerr() presumably does not return - confirm.
** NOTE(review): the capacity of errorstr is not visible here; a very
** long file name could overflow it.
*/
static FILE *merge_openindex(char *file)
{
	FILE *fp;

	if ((fp = fopen(file, "r")) == NULL) {
		sprintf(errorstr, "Couldn't read the index file \"%s\".",
		file);
		progerr(errorstr);
	}
	if (!isokindexheader(fp)) {
		sprintf(errorstr, "\"%s\" has an unknown format.",
		file);
		progerr(errorstr);
	}
	return fp;
}

/* Copies "count" file information lines (as delivered by fgets())
** from "in", starting at offset "start", to "out". Before each line
** is written, the current position in "out" is registered with
** addtofilehashlist() under the slots firstslot, firstslot + 1, ...
** "name" is only used in the error message.
*/
static void merge_copyfileinfo(FILE *in, long start, int count,
			       int firstslot, FILE *out, char *name)
{
	int i;
	char line[MAXSTRLEN];

	fseek(in, start, SEEK_SET);
	for (i = 0; i < count; i++) {
		/* Never write a buffer that fgets() did not fill. */
		if (fgets(line, MAXSTRLEN, in) == NULL) {
			sprintf(errorstr,
			"Couldn't read the file information in \"%s\".",
			name);
			progerr(errorstr);
			break;
		}
		addtofilehashlist(firstslot + i, ftell(out));
		fputs(line, out);
	}
}

/* The main merge function - it accepts three file names: the two
** index files to merge and the name of the merged index to write.
** This is a bit hairy. It basically acts as a zipper,
** zipping up both index files into one.
*/

void readmerge(char *file1, char *file2, char *outfile)
{
	int i, indexfilenum1, indexfilenum2, result, totalfiles,
		skipwords, skipfiles;
	long limit1, limit2, fileinfo1, fileinfo2, offsetstart;
	struct indexentry *ip1, *ip2, *ip3;
	FILE *fp1, *fp2, *fp3;
	struct metaMergeEntry *metaFile1, *metaFile2;

	/* remapVar is used into addindexfilelist and need to be
	** initialized each time two indexes are merged.
	*/
	remapVar = 0;
	metaFile1 = metaFile2 = NULL;

	fp1 = merge_openindex(file1);
	fp2 = merge_openindex(file2);

	ip1 = ip2 = ip3 = NULL;

	if (verbose)
		printf("Counting files... ");

	indexfilenum1 = getindexfilenum(fp1);
	indexfilenum2 = getindexfilenum(fp2);
	totalfiles = indexfilenum1 + indexfilenum2;

	if (verbose) {
		printf("%d files.\n", indexfilenum1 + indexfilenum2);
		printf("Reading stopwords...");
	}

	/* offsets[] is shared, so the positions of interest are saved
	** right after each readoffsets() call.
	*/
	readoffsets(fp1);
	readstopwords(fp1);
	limit1 = offsets[STOPWORDPOS];
	fileinfo1 = offsets[FILELISTPOS];
	metaFile1 = readMergeMeta(metaFile1, fp1);

	readoffsets(fp2);
	readstopwords(fp2);
	limit2 = offsets[STOPWORDPOS];
	fileinfo2 = offsets[FILELISTPOS];
	metaFile2 = readMergeMeta(metaFile2, fp2);

	/* Create the merged list and modify the
	   individual ones with the new meta index
	*/
	metaEntryList = createMetaMerge(metaFile1, metaFile2);

	if (verbose)
		printf("\nReading file info...");

/* Disabled: remapping of duplicate files.
	fseek(fp1, fileinfo1, 0);
	for (i = 1; i <= indexfilenum1; i++) {
		fgets(line, MAXSTRLEN, fp1);
		addindexfilelist(i, line, &totalfiles);
	}
	fseek(fp2, fileinfo2, 0);
	for (i = 1; i <= indexfilenum2; i++) {
		fgets(line, MAXSTRLEN, fp2);
		addindexfilelist(i + indexfilenum1, line, &totalfiles);
	}
*/

	if ((fp3 = fopen(outfile, "w")) == NULL) {
		sprintf(errorstr,
		"Couldn't write the merged index file \"%s\".",
		outfile);
		progerr(errorstr);
	}

	if (verbose)
		printf("\nMerging words... ");

	printheader(fp3, outfile, 0, totalfiles);

	/* Write a placeholder offset table; it is overwritten with the
	** final values once everything else has been written.
	*/
	offsetstart = ftell(fp3);
	for (i = 0; i < MAXCHARS; i++)
		fprintf(fp3, "%016li", offsets[i]);
	fputc('\n', fp3);

	readoffsets(fp1);
	readoffsets(fp2);

	for (i = 0; i < MAXCHARS; i++)
		offsets[i] = 0;

	skipwords = 0;
	while (1) {
		/* Refill whichever side has no pending entry. */
		if (ip1 == NULL)
			ip1 = readindexline(fp1, limit1, metaFile1);
		if (ip2 == NULL) {
			ip2 = readindexline(fp2, limit2, metaFile2);
			if (ip2 != NULL)
				addfilenums(ip2, indexfilenum1);
		}

		/* Stop only after BOTH sides have been tried. Testing
		** ip2 before it was re-read dropped the remaining words
		** of the second index whenever the first index ended
		** right after a word common to both.
		*/
		if (ip1 == NULL && ip2 == NULL)
			break;

		if (ip1 == NULL)
			result = 1;
		else if (ip2 == NULL)
			result = -1;
		else
			result = strcmp(ip1->word, ip2->word);

		if (!result) {
			/* Same word in both indexes: emit one merged entry. */
			ip3 = mergeindexentries(ip1, ip2);
			printindexentry(ip3, fp3);
			freeindexentry(ip1);
			freeindexentry(ip2);
			freeindexentry(ip3);
			ip1 = ip2 = ip3 = NULL;
			skipwords++;
		}
		else if (result < 0) {
			printindexentry(ip1, fp3);
			freeindexentry(ip1);
			ip1 = NULL;
		}
		else {
			printindexentry(ip2, fp3);
			freeindexentry(ip2);
			ip2 = NULL;
		}
	}

	if (verbose) {
		if (skipwords)
			printf("%d redundant word%s.", skipwords,
			(skipwords == 1) ? "" : "s");
		else
			printf("no redundant words.");
	}

	printstopwords(fp3);

	if (verbose)
		printf("\nMerging file info... ");

	offsets[FILELISTPOS] = ftell(fp3);
	merge_copyfileinfo(fp1, fileinfo1, indexfilenum1, 0, fp3, file1);
	merge_copyfileinfo(fp2, fileinfo2, indexfilenum2, indexfilenum1,
			   fp3, file2);

/* Disabled: output of the remapped file list.
	for (i = j = 1; i <= indexfilenum1 + indexfilenum2; i++)
		if (getmap(i) == j) {
			addtofilehashlist(j++ - 1, ftell(fp3));
			fprintf(fp3, "%s", lookupindexfilenum(i));
		}
*/

	skipfiles = (indexfilenum1 + indexfilenum2) - totalfiles;

	if (verbose) {
		if (skipfiles)
			printf("%d redundant file%s.", skipfiles,
			(skipfiles == 1) ? "" : "s");
		else
			printf("no redundant files.");
	}

	printfileoffsets(fp3);
	printMetaNames(fp3);

	/* Go back and fill in the real offset table. */
	fseek(fp3, offsetstart, SEEK_SET);
	for (i = 0; i < MAXCHARS; i++)
		fprintf(fp3, "%016li", offsets[i]);

	/* fclose() flushes, so this is where a failed write shows up. */
	if (fclose(fp3) != 0) {
		sprintf(errorstr,
		"Couldn't write the merged index file \"%s\".",
		outfile);
		progerr(errorstr);
	}
	fclose(fp1);
	fclose(fp2);

	if (verbose)
		printf("\nDone.\n");
}

/* Gets the number of files in an index file by counting the fgets()
** reads needed to get from offsets[FILELISTPOS] to
** offsets[FILEOFFSETPOS]. Overwrites the shared offsets[] table and
** moves the file position of fp.
*/

int getindexfilenum(FILE *fp)
{
	int i;
	char line[MAXSTRLEN];

	readoffsets(fp);
	fseek(fp, offsets[FILELISTPOS], SEEK_SET);
	i = 0;
	while (ftell(fp) != offsets[FILEOFFSETPOS]) {
		/* A truncated file never reaches the expected offset;
		** stop instead of looping forever.
		*/
		if (fgets(line, MAXSTRLEN, fp) == NULL)
			break;
		i++;
	}
	return i;
}

/* This adds an offset to the file numbers in a particular
** result list. For instance, file 1 has file numbers going from
** 1 to 10, but so does file 2, so I have to add 10 to all the
** file numbers in file 2 before merging.
*/

void addfilenums(struct indexentry *ip, int num)
{
	struct resultMerge *rp;

	rp = ip->result;
	while (rp != NULL) {
		rp->filenum = encodefilenum(decodefilenum(rp->filenum) + num);
		rp = rp->next;
	}
}

/* This reads the next line in the index file and puts the results
** in a result structure.
**
** Returns NULL when fp is already positioned at "limit" (no more
** word entries), otherwise a newly allocated entry holding the word
** and its result list. Attribute numbers that match an oldIndex in
** "metaFile" are replaced by the corresponding newIndex.
** Words longer than MAXWORDLEN - 1 characters are truncated.
*/

struct indexentry *readindexline(FILE *fp, long limit,
				 struct metaMergeEntry * metaFile)
{
	int i, c, x, countnum, rank = 0, filenum = 0, attribute = 0;
	char fileword[MAXWORDLEN];
	struct resultMerge *rp;
	struct indexentry *ip;
	struct metaMergeEntry *tmp;

	rp = NULL;

	if (limit == ftell(fp))
		return NULL;

	/* Read the word up to ':' (or a 0 byte), never writing past
	** the end of fileword and always terminating it.
	*/
	i = 0;
	while ((c = fgetc(fp)) != 0 && c != EOF && c != ':') {
		if (i < MAXWORDLEN - 1)
			fileword[i++] = (char) c;
	}
	fileword[i] = '\0';

	if (c == EOF) {
		/* Hitting EOF before "limit" means the index is damaged. */
		sprintf(errorstr,
		"The index file ended in the middle of a word entry.");
		progerr(errorstr);
		return NULL;
	}

	countnum = 1;

	/* The delimiter is pushed back and consumed again by the outer
	** loop condition below. Every pass of the outer loop reads one
	** byte before decoding a number made of 7-bit groups (high bit
	** set means another group follows).
	** NOTE(review): the on-disk layout is not visible from here;
	** the read pattern is kept exactly as it was.
	*/
	ungetc(c, fp);
	while ((c = fgetc(fp)) != 0 && c != EOF) {
		x = 0;
		do {
			c = fgetc(fp);
			if (c == 0 || c == EOF)
				break;
			x *= 128;
			x += c & 127;
		} while (c & 128);
		if (c == 0 || c == EOF)
			break;
		if (x) {
			/* Numbers arrive in triples: file, rank, attribute. */
			if (countnum == 1) {
				filenum = x;
				countnum++;
			}
			else if (countnum == 2) {
				rank = x;
				countnum++;
			}
			else if (countnum == 3) {
				attribute = x;
				/*Need to modify metaName with new list*/
				for (tmp = metaFile; tmp; tmp = tmp->next) {
					if (tmp->oldIndex == attribute) {
						attribute = tmp->newIndex;
						break;
					}
				}
				rp = addtoresultlistMerge(rp, filenum,
				rank, attribute);
				countnum = 1;
			}
		}
	}

	ip = (struct indexentry *) emalloc(sizeof(struct indexentry));
	ip->word = (char *) mystrdup(fileword);
	ip->result = rp;

	return ip;
}

/* This simply concatenates two information lists that correspond
** to a word found in both index files. The inputs are left
** untouched; the returned entry is newly allocated and carries a
** copy of ip1's word.
*/

struct indexentry *mergeindexentries(struct indexentry *ip1,
				     struct indexentry *ip2)
{
	struct resultMerge *newrp, *rp1, *rp2;
	struct indexentry *ep;

	rp1 = ip1->result;
	rp2 = ip2->result;
	newrp = NULL;

	while (rp1 != NULL) {
		newrp = addtoresultlistMerge(newrp, rp1->filenum, rp1->rank,
					     rp1->attribute);
		rp1 = rp1->next;
	}
	while (rp2 != NULL) {
		newrp = addtoresultlistMerge(newrp, rp2->filenum, rp2->rank,
					     rp2->attribute);
		rp2 = rp2->next;
	}

	ep = (struct indexentry *) emalloc(sizeof(struct indexentry));
	ep->word = (char *) mystrdup(ip1->word);
	ep->result = newrp;

	return ep;
}

/* This prints a new word entry into the merged index file:
** the word, a ':', every result as three compress()ed numbers
** (file number, rank, attribute) and a terminating 0 byte.
** It also records in offsets[] the file position of the first
** entry seen for each leading character listed in indexchars.
*/

void printindexentry(struct indexentry *ip, FILE *fp)
{
	int i;
	struct resultMerge *rp;

	for (i = 0; indexchars[i] != '\0'; i++)
		if ((ip->word)[0] == indexchars[i] && !offsets[i])
			offsets[i] = ftell(fp);

	fprintf(fp, "%s:", ip->word);

	for (rp = ip->result; rp != NULL; rp = rp->next) {
		compress(rp->filenum, fp);
		compress(rp->rank, fp);
		compress(rp->attribute, fp);
	}

	fputc(0, fp);
}

#ifdef NDEF
/* This associates a number with a new number.
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -