📄 index.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
/*
** Copyright (C) 1995, Enterprise Integration Technologies Corp.
** All Rights Reserved.
** Kevin Hughes, kevinh@eit.com
** 3/11/94
**
** Released under the GPL 1997
**
** Modifications to index Harvest SOIF records - Simon Wilkinson 12/10/97
*/

#include "swish.h"
#include "index.h"
#include "hash.h"
#include "util.h"
#include "template.h"
#include "file.h"
#include "string.h"
#include "mem.h"
#include "error.h"
#include "check.h"
#include "search.h"

/* Recursively goes into a directory and calls the word-indexing
** functions for each file that's found.
**
** Entries whose name starts with '.' are skipped. Plain files are
** indexed while the directory is being read; subdirectories are queued
** and only visited after the directory has been closed, so a single
** directory handle is open at any time.
**
** dir is modified in place: one trailing '/' is removed, except when
** dir is exactly "/" so that the root directory can still be opened.
** Paths that do not fit into MAXFILELEN bytes are skipped.
*/

void indexadir(char *dir)
{
    DIR *dfd;
#ifdef NEXTSTEP
    struct direct *dp;
#else
    struct dirent *dp;
#endif
    static char s[MAXFILELEN];
    struct fileentry *list = NULL;
    struct fileentry *current = NULL, *tmp;
    size_t dirlen;
    int written;

    dirlen = strlen(dir);
    /* dirlen > 1 keeps "/" intact and avoids indexing dir[-1] on "". */
    if (dirlen > 1 && dir[dirlen - 1] == '/')
        dir[--dirlen] = '\0';

    if ((dfd = opendir(dir)) == NULL)
        return;

    while ((dp = readdir(dfd)) != NULL) {

        if ((dp->d_name)[0] == '.')
            continue;

        written = snprintf(s, sizeof s, "%s%s%s", dir,
                           (dirlen > 0 && dir[dirlen - 1] == '/') ? "" : "/",
                           dp->d_name);
        if (written < 0 || (size_t) written >= sizeof s)
            continue;           /* path would have been truncated */

        if (!isdirectory(s)) {
            countwords(s);
        }
        else {
            /* s is a shared static buffer, so the name must be copied
            ** before the next readdir() iteration overwrites it. */
            tmp = emalloc(sizeof(struct fileentry));
            tmp->filename = (char *) strdup(s);
            if (tmp->filename == NULL) {
                free(tmp);
                continue;
            }
            tmp->next = NULL;
            if (current == NULL)
                list = tmp;
            else
                current->next = tmp;
            current = tmp;
        }
    }
    closedir(dfd);

    while (list != NULL) {
        printf("Doing %s\n", list->filename);
        indexadir(list->filename);
        free(list->filename);
        current = list;
        list = list->next;
        free(current);
    }
}

/* Calls the word-indexing function for a single file.
*/

void indexafile(path)
     char *path;
{
    countwords(path);
}

/* Adds a word to the master index tree.
**
** The tree is a binary search tree ordered by strcmp() on the word.
** Each node carries a list of locations keyed by (filenum, attribute).
** A new word creates a node; a known word either bumps the frequency
** of the matching location or appends a new location and increments
** the node's tfrequency.
**
** Returns the (possibly newly allocated) root of the subtree e.
*/

struct entry *addentry(e, word, filenum, attribute)
     struct entry *e;
     char *word;
     int filenum;
     int attribute;
{
    int isbigger;
    struct location *tp, *oldtp = NULL;

    if (e == NULL) {
        e = (struct entry *) emalloc(sizeof(struct entry));
        e->word = (char *) mystrdup(word);
        e->tfrequency = 1;
        e->locationlist = (struct location *)
            emalloc(sizeof(struct location));
        e->locationlist->filenum = filenum;
        e->locationlist->frequency = 1;
        e->locationlist->attribute = attribute;
        e->locationlist->next = NULL;
        e->left = e->right = NULL;
        totalwords++;
        return e;
    }

    isbigger = strcmp(e->word, word);

    if (isbigger > 0) {
        e->left = addentry(e->left, word, filenum, attribute);
        return e;
    }

    if (isbigger < 0) {
        e->right = addentry(e->right, word, filenum, attribute);
        return e;
    }

    /* Same word: look for an existing (filenum, attribute) location. */
    tp = e->locationlist;
    while (tp != NULL && !(tp->filenum == filenum &&
                           tp->attribute == attribute)) {
        oldtp = tp;
        tp = tp->next;
    }

    if (tp == NULL) {
        tp = (struct location *) emalloc(sizeof(struct location));
        tp->filenum = filenum;
        tp->frequency = 1;
        tp->attribute = attribute;
        tp->next = NULL;
        /* Nodes are created with one location, so oldtp is normally
        ** set; the NULL case is handled defensively. */
        if (oldtp != NULL)
            oldtp->next = tp;
        else
            e->locationlist = tp;
        e->tfrequency = e->tfrequency + 1;
    }
    else {
        tp->frequency = tp->frequency + 1;
    }

    return e;
}

/* Adds a file to the master list of files and file numbers.
**
** The tail of the list is remembered in a function-local static, so
** appending does not walk the list. Returns the head of the list:
** filep itself, or the new node when filep was NULL.
*/

struct file *addtofilelist(filep, filename, size)
     struct file *filep;
     char *filename;
     int size;
{
    struct file *newnode;
    static struct file *filelistp = NULL;

    newnode = (struct file *) emalloc(sizeof(struct file));
    newnode->filename = (char *) mystrdup(filename);
    newnode->size = size;
    newnode->next = NULL;

    if (filep == NULL)
        filep = newnode;
    else if (filelistp != NULL)
        filelistp->next = newnode;

    filelistp = newnode;

    return filep;
}

/* Just goes through the master list of files and
** counts 'em.
*/

int getfilecount(filep)
     struct file *filep;
{
    int i;

    for (i = 0; filep != NULL; filep = filep->next)
        i++;

    return i;
}

/* Returns the nicely formatted date.
**
** The result lives in a static buffer that is overwritten by the next
** call. An empty string is returned if the date could not be formatted.
*/

char *getthedate()
{
    static char date[MAXSTRLEN];
    time_t now;

    now = (time_t) getthetime();
    /* strftime() leaves the buffer indeterminate when it returns 0. */
    if (strftime(date, MAXSTRLEN, "%d/%m/%y %H:%M:%S %Z",
                 (struct tm *) localtime(&now)) == 0)
        date[0] = '\0';

    return date;
}

/* Indexes all the words in a file and adds the appropriate information
** to the appropriate structures.
**
** The file is parsed as a SOIF template. The URL and every non-empty
** attribute value except "md5" are passed to add_attrval(). File
** numbers that are multiples of 128 are skipped, matching the encoding
** written by compress().
**
** Returns the number of words indexed, or 0 if the file cannot be
** opened.
*/

int countwords(filename)
     char *filename;
{
    int ftotalwords;
    Template *template;
    AVList *walker;
    static int filenum;
    FILE *fp;

    if ((fp = fopen(filename, "r")) == NULL)
        return 0;

    ftotalwords = 0;

    filelist = addtofilelist(filelist, filename, getsize(filename));
    filenum++;
    if (!(filenum % 128))
        filenum++;

    init_parse_template_file(fp);
    template = parse_template();

    if (template == NULL) {
        /* NOTE(review): progerr() presumably does not return - confirm. */
        progerr("Badly formed SOIF file found");
    }
    else {
        /* Use the string length of the URL. sizeof(template->url) is
        ** the size of the member itself (the pointer size if url is a
        ** char * - TODO confirm against template.h), not of the text. */
        if (template->url != NULL)
            ftotalwords += add_attrval(filenum, "url", template->url,
                                       strlen(template->url));

        for (walker = template->list; walker; walker = walker->next) {
            if (walker->data->vsize == 0)
                continue;
            if (!strcmp(walker->data->attribute, "md5"))
                continue;
            ftotalwords += add_attrval(filenum, walker->data->attribute,
                                       walker->data->value,
                                       walker->data->vsize);
        }

        /* Only release a template that was actually parsed. */
        free_template(template);
    }

    fclose(fp);

    addtofwordtotals(filenum, ftotalwords);
    return ftotalwords;
}

/* Splits the first valsize bytes of val into words and adds each one
** to the master index for file filenum.
**
** A word is a run of alphabetic characters, lower-cased and truncated
** to MAXWORDLEN - 1 characters. It is indexed when its length is at
** least MINWORDLIMIT and it is not a stopword.
**
** attr is currently unused: every word is recorded with attribute 1.
**
** Returns the number of words added.
*/

int add_attrval(int filenum, char *attr, char *val, size_t valsize)
{
    int c, i, inword, ftotalwords, validchar;
    int attribute;
    char word[MAXWORDLEN];
    size_t ptr;

    (void) attr;

    ftotalwords = 0;
    i = 0;
    inword = 0;
    attribute = 1;
    ptr = 0;

    while (ptr < valsize) {
        /* <ctype.h> functions require an unsigned char value (or EOF);
        ** a plain char may be negative for bytes above 0x7f. */
        c = (unsigned char) val[ptr];
        ptr++;
        validchar = isalpha(c);

        if (!inword) {
            if (validchar) {
                word[0] = tolower(c);
                i = 1;
                inword = 1;
            }
            continue;
        }

        if (validchar && ptr != valsize) {
            /* Still inside the word. */
            word[i++] = tolower(c);
            if (i == MAXWORDLEN)
                i--;
            continue;
        }

        /* The word ends here: on a non-letter or on the last byte. */
        if (validchar) {
            word[i++] = tolower(c);
            if (i == MAXWORDLEN)
                i--;
        }
        word[i++] = '\0';
        /* i now counts the terminator too, hence '>' rather than '>='. */
        if (i > MINWORDLIMIT && !isstopword(word)) {
            entrylist = (struct entry *) addentry(entrylist, word,
                                                  filenum, attribute);
            ftotalwords++;
        }
        inword = 0;
    }

    return ftotalwords;
}

/* Removes words that occur in over _plimit_ percent of the files and
** that occur in over _flimit_ files (marks them as stopwords, that is).
**
** Walks the tree in order. Returns the number of words marked.
*/

int removestops(ep, totalfiles, plimit, flimit)
     struct entry *ep;
     int totalfiles;
     int plimit;
     int flimit;
{
    int percent, wordfilecount, stopwords;
    struct location *lp;

    if (ep == NULL)
        return 0;

    stopwords = removestops(ep->left, totalfiles, plimit, flimit);

    wordfilecount = 0;
    for (lp = ep->locationlist; lp != NULL; lp = lp->next)
        wordfilecount++;

    /* Guard the division; with no files the percentage is taken as 0. */
    if (totalfiles > 0)
        percent = ((float) wordfilecount / (float) totalfiles) * 100.0;
    else
        percent = 0;

    if (percent >= plimit && wordfilecount >= flimit) {
        addStopList(ep->word);
        addstophash(ep->word);
        stopwords++;
    }

    stopwords += removestops(ep->right, totalfiles, plimit, flimit);

    return stopwords;
}

/* This is somewhat similar to the rank calculation algorithm
** from WAIS (I think). Any suggestions for improvements?
** Note that ranks can't be smaller than 1, emphasized words
** (words in titles, headers) have ranks multiplied by at least 5
** (just a guess), and ranks divisible by 128 are bumped up by one
** (to make the compression scheme with '\0' as a line delimiter
** work). Fudging with the ranks doesn't seem to make much difference.
*/

int getrank(freq, tfreq, words, emphasized)
     int freq;
     int tfreq;
     int words;
     int emphasized;
{
    float d, e, f;
    int tmprank;
    char rankstr[MAXSTRLEN];

    if (freq < 5)
        freq = 5;

    d = 1.0 / (double) tfreq;
    e = ((log((double) freq) + 10.0) * d) / words;
    f = e * 10000.0;

    /* The value goes through text so that it is rounded to six
    ** decimals before the fraction is dropped, as it always has been. */
    snprintf(rankstr, sizeof rankstr, "%f", f);
    tmprank = atoi(rankstr);

    if (tmprank <= 0)
        tmprank = 1;
    if (emphasized)
        tmprank *= emphasized;
    if (!(tmprank % 128))
        tmprank++;

    return tmprank;
}

/* Prints the index information at the head of index files.
**
** "Saved as" shows the part of filename after the last '/'. The whole
** filename is shown when there is no '/' or when nothing follows it.
*/

void printheader(fp, filename, totalwords, totalfiles)
     FILE *fp;
     char *filename;
     int totalwords;
     int totalfiles;
{
    char *c;
    char *savedas;

    c = (char *) strrchr(filename, '/');
    savedas = (c == NULL || c[1] == '\0') ? filename : c + 1;

    fprintf(fp, "%s\n", INDEXHEADER);
    fprintf(fp, "# Name: %s\n", (indexn[0] == '\0') ? "(no name)" :
            indexn);
    fprintf(fp, "# Saved as: %s\n", savedas);
    fprintf(fp, "# Counts: ");
    if (totalwords)
        fprintf(fp, "%d words%s", totalwords, (totalfiles) ? ", " : "");
    if (totalfiles)
        fprintf(fp, "%d files", totalfiles);
    fprintf(fp, "\n");
    fprintf(fp, "# Indexed on: %s\n", getthedate());
    fprintf(fp, "# Description: %s\n", (indexd[0] == '\0') ?
            "(no description)" : indexd);
    fprintf(fp, "# Pointer: %s\n", (indexp[0] == '\0') ?
            "(no pointer)" : indexp);
    fprintf(fp, "# Maintained by: %s\n", (indexa[0] == '\0') ?
            "(no maintainer)" : indexa);
}

/* Print the index entries that hold the word, rank, and other information.
**
** The tree is written in order. Each non-stopword becomes "word:"
** followed by a compressed (filenum, rank, attribute) triple per
** location and a terminating '\0' byte. The file position of the first
** word starting with each character of indexchars is stored in offsets.
*/

void printindex(ep, fp)
     struct entry *ep;
     FILE *fp;
{
    int i, rank;
    struct location *lp;

    if (ep == NULL)
        return;

    printindex(ep->left, fp);

    if (!isstopword(ep->word)) {
        for (i = 0; indexchars[i] != '\0'; i++) {
            if ((ep->word)[0] == indexchars[i] && !offsets[i])
                offsets[i] = ftell(fp);
        }

        fprintf(fp, "%s:", ep->word);

        for (lp = ep->locationlist; lp != NULL; lp = lp->next) {
            compress(lp->filenum, fp);
            rank = getrank(lp->frequency, ep->tfrequency,
                           gettotalwords(lp->filenum), 0);
            compress(rank, fp);
            compress(lp->attribute, fp);
        }

        fputc(0, fp);
    }

    printindex(ep->right, fp);
}

/* Prints the list of stopwords into the index file.
*/

void printstopwords(fp)
     FILE *fp;
{
    int hashval;
    struct swline *sp;

    offsets[STOPWORDPOS] = ftell(fp);
    for (hashval = 0; hashval < HASHSIZE; hashval++) {
        for (sp = hashstoplist[hashval]; sp != NULL; sp = sp->next)
            fprintf(fp, "%s ", sp->line);
    }
    fprintf(fp, "\n");
}

/* Prints the list of files, titles, and sizes into the index file.
**
** The file position of every line is recorded with
** addtofilehashlist(). File names go through ruleparse() first.
*/

void printfilelist(filep, fp)
     struct file *filep;
     FILE *fp;
{
    int i;

    i = 0;
    offsets[FILELISTPOS] = ftell(fp);
    while (filep != NULL) {
        addtofilehashlist(i++, ftell(fp));
        fprintf(fp, "%s %d\n", ruleparse(filep->filename),
                filep->size);
        filep = filep->next;
    }
}

/* Prints the list of metaNames into the file index
*/

void printMetaNames(fp)
     FILE *fp;
{
    struct metaEntry *entry;

    offsets[METANAMEPOS] = ftell(fp);
    for (entry = metaEntryList; entry; entry = entry->next)
        fprintf(fp, "%s ", entry->metaName);
    fprintf(fp, "\n");
}

/* Prints the list of file offsets into the index file.
**
** Offsets are written back to back as 16-digit zero-padded numbers
** until getfilenum() returns 0.
*/

void printfileoffsets(fp)
     FILE *fp;
{
    int i;

    offsets[FILEOFFSETPOS] = ftell(fp);
    for (i = 0; getfilenum(i) != 0; i++)
        fprintf(fp, "%016li", getfilenum(i));
    fprintf(fp, "\n");
}

/* Takes a number and prints it to a file using the simple
** accordion scheme of storing numbers.
**
** The number is split into 7-bit groups, written most significant
** group first. Every byte except the last has its high bit set.
** Callers must pass a positive number that is not a multiple of 128,
** because a '\0' byte is the line delimiter; nothing is written for 0.
*/

void compress(num, fp)
     int num;
     FILE *fp;
{
    unsigned char digits[16];
    unsigned int value;
    int count;

    value = (unsigned int) num;
    count = 0;
    while (value) {
        digits[count++] = (unsigned char) (value % 128);
        value /= 128;
    }

    /* '> 0', not '>= 0': the extra pass read digits[-1] and wrote a
    ** stray byte after every number. */
    while (count-- > 0)
        fputc(digits[count] | (count ? 128 : 0), fp);
}

/* Prints out the decompressed values in an index file.
**
** Header lines (starting with '#') are echoed, then every entry is
** printed as "word:" followed by its decoded numbers. Everything from
** the stopword section onwards is copied through unchanged.
*/

void decompress(fp)
     FILE *fp;
{
    int c, x, inword;
    long pos;
    char line[MAXSTRLEN], header[MAXHEADCHARS + 1];

    readoffsets(fp);
    fseek(fp, 0, 0);

    inword = 1;

    while (1) {
        c = fgetc(fp);
        ungetc(c, fp);
        if (c == '#') {
            if (fgets(line, MAXSTRLEN, fp) == NULL)
                break;
            printf("%s", line);
            continue;
        }
        if (fgets(header, MAXHEADCHARS + 1, fp) != NULL)
            printf("%s", header);
        break;
    }

    while ((c = fgetc(fp)) != EOF) {
        if (c == ':' && inword) {
            inword = 0;
            putchar(c);
        }
        if (inword) {
            putchar(c);
            continue;
        }

        x = 0;
        do {
            c = fgetc(fp);
            if (c == EOF) {
                /* Truncated index: EOF has the continuation bit set
                ** when masked, so the loop would never end. */
                putchar('\n');
                return;
            }
            pos = ftell(fp);
            if (pos == offsets[STOPWORDPOS]) {
                putchar('\n');
                while (fgets(line, MAXSTRLEN, fp) != NULL)
                    printf("%s", line);
                return;
            }
            if (c == 0) {
                putchar('\n');
                inword = 1;
                break;
            }
            x *= 128;
            x += c & 127;
        } while (c & 128);

        if (x)
            printf(" %d", x);
    }
}

/* Parses lines according to the ReplaceRules directives.
**
** replacelist is read as a flat sequence of tokens: a rule name
** followed by its arguments ("replace" takes two, "append" and
** "prepend" take one). Rules are applied in order. Processing stops at
** the first rule whose arguments are missing. A token that is not a
** known rule is skipped without touching the line.
**
** Returns line itself when there are no rules, otherwise a pointer to
** a static buffer that is overwritten by the next call. Results longer
** than MAXSTRLEN - 1 characters are truncated.
*/

char *ruleparse(line)
     char *line;
{
    char rule[MAXSTRLEN];
    static char tmpline[MAXSTRLEN], newtmpline[MAXSTRLEN];
    static char line1[MAXSTRLEN], line2[MAXSTRLEN];
    struct swline *tmplist;

    if (replacelist == NULL)
        return line;

    tmplist = replacelist;
    snprintf(tmpline, sizeof tmpline, "%s", line);

    while (tmplist != NULL) {
        snprintf(rule, sizeof rule, "%s", tmplist->line);
        tmplist = tmplist->next;
        if (tmplist == NULL)
            break;              /* rule without any argument */

        if (lstrstr(rule, "replace")) {
            if (tmplist->next == NULL)
                break;          /* "replace" needs two arguments */
            snprintf(line1, sizeof line1, "%s", tmplist->line);
            tmplist = tmplist->next;
            snprintf(line2, sizeof line2, "%s", tmplist->line);
            tmplist = tmplist->next;
            /* Two passes through the NOWORD placeholder, as before. */
            snprintf(newtmpline, sizeof newtmpline, "%s",
                     (char *) replace(tmpline, line1, NOWORD));
            snprintf(newtmpline, sizeof newtmpline, "%s",
                     (char *) replace(newtmpline, NOWORD, line2));
        }
        else if (lstrstr(rule, "append")) {
            snprintf(newtmpline, sizeof newtmpline, "%s%s", tmpline,
                     tmplist->line);
            tmplist = tmplist->next;
        }
        else if (lstrstr(rule, "prepend")) {
            snprintf(newtmpline, sizeof newtmpline, "%s%s", tmplist->line,
                     tmpline);
            tmplist = tmplist->next;
        }
        else {
            /* Unknown rule: newtmpline holds stale data from an
            ** earlier rule or call, so it must not be copied back. */
            continue;
        }

        snprintf(tmpline, sizeof tmpline, "%s", newtmpline);
    }

    return tmpline;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -