📄 index.c
字号:
/*
** Copyright (C) 1995, Enterprise Integration Technologies Corp.
** All Rights Reserved.
** Kevin Hughes, kevinh@eit.com
** 3/11/94
**
** Released under the GPL 1997
**
** Modifications to index Harvest SOIF records - Simon Wilkinson 12/10/97
*/

#include "swish.h"
#include "index.h"
#include "hash.h"
#include "util.h"
#include "template.h"
#include "file.h"
#include "string.h"
#include "mem.h"
#include "error.h"
#include "check.h"
#include "search.h"

/* Recursively goes into a directory and calls the word-indexing
** functions for each file that's found.
**
** Entries whose name starts with '.' are skipped. Anything that
** isdirectory() rejects is handed to countwords() immediately;
** subdirectories are queued and only descended into after the
** directory handle has been closed.
**
** A single trailing '/' on dir is stripped in place, so dir must be
** writable. Paths that do not fit in MAXFILELEN are skipped.
*/
void indexadir(char *dir)
{
    DIR *dfd;
#ifdef NEXTSTEP
    struct direct *dp;
#else
    struct dirent *dp;
#endif
    static char s[MAXFILELEN];
    struct fileentry *list = NULL;
    struct fileentry *current = NULL, *tmp;
    size_t len;
    int n;

    len = strlen(dir);
    if (len > 0 && dir[len - 1] == '/')
        dir[--len] = '\0';

    if ((dfd = opendir(dir)) == NULL)
        return;

    while ((dp = readdir(dfd)) != NULL) {
        if ((dp->d_name)[0] == '.')
            continue;

        /* Only one trailing slash was stripped, so dir may still end
        ** in '/'; avoid doubling the separator in that case.
        */
        n = snprintf(s, sizeof(s), "%s%s%s", dir,
                     (len > 0 && dir[len - 1] == '/') ? "" : "/",
                     dp->d_name);
        if (n < 0 || (size_t) n >= sizeof(s))
            continue;           /* path would not fit; skip it */

        if (!isdirectory(s)) {
            countwords(s);
        } else {
            tmp = emalloc(sizeof(struct fileentry));
            tmp->filename = (char *) strdup(s);
            if (tmp->filename == NULL) {
                /* Could not copy the name; drop this subdirectory. */
                free(tmp);
                continue;
            }
            tmp->next = NULL;
            if (current == NULL)
                list = tmp;
            else
                current->next = tmp;
            current = tmp;
        }
    }
    closedir(dfd);

    /* s is static and gets overwritten by the recursive calls below,
    ** which is why each queued name was duplicated above.
    */
    while (list != NULL) {
        printf("Doing %s\n", list->filename);
        indexadir(list->filename);
        free(list->filename);
        current = list;
        list = list->next;
        free(current);
    }
}

/* Calls the word-indexing function for a single file.
*/
void indexafile(path)
     char *path;
{
    countwords(path);
}

/* Adds a word to the master index tree.
**
** The tree is a binary search tree ordered by strcmp() on the word.
** Returns the (possibly newly allocated) root of the subtree e.
** A new word gets a fresh node with one location and bumps the global
** totalwords. An existing word either gets a new location record
** for (filenum, attribute) - which also bumps tfrequency - or has the
** frequency of the matching location incremented.
*/
struct entry *addentry(e, word, filenum, attribute)
     struct entry *e;
     char *word;
     int filenum;
     int attribute;
{
    int isbigger;
    struct location *tp, *oldtp = NULL;

    if (e == NULL) {
        e = (struct entry *) emalloc(sizeof(struct entry));
        e->word = (char *) mystrdup(word);
        e->tfrequency = 1;
        e->locationlist = (struct location *) emalloc(sizeof(struct location));
        e->locationlist->filenum = filenum;
        e->locationlist->frequency = 1;
        e->locationlist->attribute = attribute;
        e->locationlist->next = NULL;
        e->left = e->right = NULL;
        totalwords++;
        return e;
    }

    isbigger = strcmp(e->word, word);
    if (isbigger > 0) {
        e->left = addentry(e->left, word, filenum, attribute);
        return e;
    }
    if (isbigger < 0) {
        e->right = addentry(e->right, word, filenum, attribute);
        return e;
    }

    /* Same word: look for an existing (filenum, attribute) location. */
    tp = e->locationlist;
    while (tp != NULL &&
           !(tp->filenum == filenum && tp->attribute == attribute)) {
        oldtp = tp;
        tp = tp->next;
    }

    if (tp == NULL) {
        /* Not found: append a new location. Every node is created
        ** above with a non-empty location list, so the walk has
        ** visited at least one element and oldtp is set.
        */
        tp = (struct location *) emalloc(sizeof(struct location));
        tp->filenum = filenum;
        tp->frequency = 1;
        tp->attribute = attribute;
        tp->next = NULL;
        oldtp->next = tp;
        e->tfrequency = e->tfrequency + 1;
    } else {
        tp->frequency = tp->frequency + 1;
    }

    return e;
}

/* Adds a file to the master list of files and file numbers.
**
** filep is the current head of the list (NULL for an empty list);
** the head is returned. The tail is remembered in a function-static
** pointer so that appending does not walk the list.
*/
struct file *addtofilelist(filep, filename, size)
     struct file *filep;
     char *filename;
     int size;
{
    struct file *newnode;
    static struct file *filelistp = NULL;

    newnode = (struct file *) emalloc(sizeof(struct file));
    newnode->filename = (char *) mystrdup(filename);
    newnode->size = size;
    newnode->next = NULL;

    if (filep == NULL)
        filep = newnode;
    else if (filelistp != NULL)
        filelistp->next = newnode;

    filelistp = newnode;

    return filep;
}

/* Just goes through the master list of files and
** counts 'em.
*/
int getfilecount(filep)
     struct file *filep;
{
    int i;

    for (i = 0; filep != NULL; filep = filep->next)
        i++;

    return i;
}

/* Returns the nicely formatted date.
**
** The result lives in a static buffer that is overwritten by the
** next call.
*/
char *getthedate()
{
    static char date[MAXSTRLEN];
    time_t now;

    now = (time_t) getthetime();
    strftime(date, MAXSTRLEN, "%d/%m/%y %H:%M:%S %Z",
             (struct tm *) localtime(&now));

    return date;
}

/* Indexes all the words in a file and adds the appropriate information
** to the appropriate structures.
**
** The file is parsed as a template (SOIF record). Its url and every
** non-empty attribute other than "md5" are fed to add_attrval().
** Returns the number of words indexed for this file, or 0 if the file
** cannot be opened.
*/
int countwords(filename)
     char *filename;
{
    int ftotalwords;
    Template *template;
    AVList *walker;
    static int filenum;
    FILE *fp;

    if ((fp = fopen(filename, "r")) == NULL)
        return 0;

    ftotalwords = 0;
    filelist = addtofilelist(filelist, filename, getsize(filename));

    /* File numbers divisible by 128 are skipped; see getrank() for the
    ** reason given for ranks (a low byte of 0 in the compressed form).
    */
    filenum++;
    if (!(filenum % 128))
        filenum++;

    init_parse_template_file(fp);
    template = parse_template();
    if (template == NULL) {
        progerr("Badly formed SOIF file found");
    } else {
        /* NOTE(review): sizeof(template->url) is only a string length
        ** if url is a char array; if it is a pointer this is the
        ** pointer size - confirm against template.h.
        */
        ftotalwords += add_attrval(filenum, "url", template->url,
                                   sizeof(template->url));
        for (walker = template->list; walker; walker = walker->next) {
            if (walker->data->vsize == 0)
                continue;
            if (!strcmp(walker->data->attribute, "md5"))
                continue;
            ftotalwords += add_attrval(filenum, walker->data->attribute,
                                       walker->data->value,
                                       walker->data->vsize);
        }
        free_template(template);
    }

    fclose(fp);

    addtofwordtotals(filenum, ftotalwords);
    return ftotalwords;
}

/* Splits the first valsize bytes of val into alphabetic words,
** lower-cases them and adds each one that is long enough and not a
** stopword to the master index under filenum.
**
** attr is accepted for the caller's benefit but is not consulted:
** every word is stored with attribute 1. Words longer than the word
** buffer are truncated. Returns the number of words added.
*/
int add_attrval(int filenum, char *attr, char *val, size_t valsize)
{
    int i, inword, ftotalwords, validchar;
    int attribute;
    unsigned char c;
    char word[MAXWORDLEN];
    size_t ptr;

    (void) attr;

    ftotalwords = 0;
    i = 0;
    inword = 0;
    attribute = 1;
    ptr = 0;

    while (ptr < valsize) {
        /* <ctype.h> functions need a value representable as unsigned
        ** char; a plain (possibly negative) char is undefined there.
        */
        c = (unsigned char) val[ptr];
        ptr++;
        validchar = isalpha(c);

        if (!inword) {
            if (validchar) {
                word[0] = tolower(c);
                i = 1;
                inword = 1;
            }
        } else if (!validchar || ptr == valsize) {
            /* End of word: either a non-letter or the end of input. */
            if (validchar) {
                word[i++] = tolower(c);
                if (i == MAXWORDLEN)
                    i--;
            }
            word[i++] = '\0';
            /* i counts the terminating NUL here. */
            if (i > MINWORDLIMIT && !isstopword(word)) {
                entrylist = (struct entry *)
                    addentry(entrylist, word, filenum, attribute);
                ftotalwords++;
            }
            inword = 0;
        } else {
            word[i++] = tolower(c);
            if (i == MAXWORDLEN)
                i--;
        }
    }

    return ftotalwords;
}

/* Removes words that occur in over _plimit_ percent of the files and
** that occur in over _flimit_ files (marks them as stopwords, that is).
**
** Walks the tree in order and returns the number of words marked.
** The per-word file count is the length of its location list.
*/
int removestops(ep, totalfiles, plimit, flimit)
     struct entry *ep;
     int totalfiles;
     int plimit;
     int flimit;
{
    int percent, wordfilecount, stopwords;
    struct location *lp;

    stopwords = 0;
    if (ep != NULL) {
        stopwords += removestops(ep->left, totalfiles, plimit, flimit);

        wordfilecount = 0;
        for (lp = ep->locationlist; lp != NULL; lp = lp->next)
            wordfilecount++;

        percent = ((float) wordfilecount / (float) totalfiles) * 100.0;
        if (percent >= plimit && wordfilecount >= flimit) {
            addStopList(ep->word);
            addstophash(ep->word);
            stopwords++;
        }

        stopwords += removestops(ep->right, totalfiles, plimit, flimit);
    }
    return stopwords;
}

/* This is somewhat similar to the rank calculation algorithm
** from WAIS (I think). Any suggestions for improvements?
** Note that ranks can't be smaller than 1, emphasized words
** (words in titles, headers) have ranks multiplied by at least 5
** (just a guess), and ranks divisible by 128 are bumped up by one
** (to make the compression scheme with '\0' as a line delimiter
** work). Fudging with the ranks doesn't seem to make much difference.
**
** A non-zero emphasized value is used directly as the multiplier.
*/
int getrank(freq, tfreq, words, emphasized)
     int freq;
     int tfreq;
     int words;
     int emphasized;
{
    float d, e, f;
    int tmprank;
    char rankstr[MAXSTRLEN];

    if (freq < 5)
        freq = 5;
    d = 1.0 / (double) tfreq;
    e = ((log((double) freq) + 10.0) * d) / words;
    f = e * 10000.0;

    /* The float is converted to int by printing it and parsing the
    ** leading digits back.
    */
    sprintf(rankstr, "%f", f);
    tmprank = atoi(rankstr);
    if (tmprank <= 0)
        tmprank = 1;
    if (emphasized)
        tmprank *= emphasized;
    if (!(tmprank % 128))
        tmprank++;

    return tmprank;
}

/* Prints the index information at the head of index files.
**
** "Saved as" shows the part of filename after its last '/'; the whole
** filename is shown when there is no '/' or nothing follows it.
*/
void printheader(fp, filename, totalwords, totalfiles)
     FILE *fp;
     char *filename;
     int totalwords;
     int totalfiles;
{
    char *c;

    c = (char *) strrchr(filename, '/');

    fprintf(fp, "%s\n", INDEXHEADER);
    fprintf(fp, "# Name: %s\n", (indexn[0] == '\0') ? "(no name)" : indexn);
    fprintf(fp, "# Saved as: %s\n",
            (c == NULL || c[1] == '\0') ? filename : c + 1);
    fprintf(fp, "# Counts: ");
    if (totalwords)
        fprintf(fp, "%d words%s", totalwords, (totalfiles) ? ", " : "");
    if (totalfiles)
        fprintf(fp, "%d files", totalfiles);
    fprintf(fp, "\n");
    fprintf(fp, "# Indexed on: %s\n", getthedate());
    fprintf(fp, "# Description: %s\n",
            (indexd[0] == '\0') ? "(no description)" : indexd);
    fprintf(fp, "# Pointer: %s\n",
            (indexp[0] == '\0') ? "(no pointer)" : indexp);
    fprintf(fp, "# Maintained by: %s\n",
            (indexa[0] == '\0') ? "(no maintainer)" : indexa);
}

/* Print the index entries that hold the word, rank, and other information.
**
** The tree is written in order. Each non-stopword is written as
** "word:" followed by a compressed (filenum, rank, attribute) triple
** per location and a terminating 0 byte. The file position of the
** first word starting with each character of indexchars is recorded
** in offsets[].
*/
void printindex(ep, fp)
     struct entry *ep;
     FILE *fp;
{
    int i, rank;
    struct location *lp;

    if (ep == NULL)
        return;

    printindex(ep->left, fp);

    if (!isstopword(ep->word)) {
        for (i = 0; indexchars[i] != '\0'; i++)
            if ((ep->word)[0] == indexchars[i] && !offsets[i])
                offsets[i] = ftell(fp);

        fprintf(fp, "%s:", ep->word);

        for (lp = ep->locationlist; lp != NULL; lp = lp->next) {
            compress(lp->filenum, fp);
            rank = getrank(lp->frequency, ep->tfrequency,
                           gettotalwords(lp->filenum), 0);
            compress(rank, fp);
            compress(lp->attribute, fp);
        }

        fputc(0, fp);
    }

    printindex(ep->right, fp);
}

/* Prints the list of stopwords into the index file.
**
** Records the starting position in offsets[STOPWORDPOS], then writes
** every entry of the stopword hash table followed by a space, and a
** final newline.
*/
void printstopwords(fp)
     FILE *fp;
{
    int hashval;
    struct swline *sp;

    offsets[STOPWORDPOS] = ftell(fp);
    for (hashval = 0; hashval < HASHSIZE; hashval++) {
        for (sp = hashstoplist[hashval]; sp != NULL; sp = sp->next)
            fprintf(fp, "%s ", sp->line);
    }
    fprintf(fp, "\n");
}

/* Prints the list of files, titles, and sizes into the index file.
**
** Records the starting position in offsets[FILELISTPOS]. Each file is
** written as "<name after ruleparse()> <size>" on its own line, and
** the position of each line is registered with addtofilehashlist()
** under a counter that starts at 0.
*/
void printfilelist(filep, fp)
     struct file *filep;
     FILE *fp;
{
    int i;

    i = 0;
    offsets[FILELISTPOS] = ftell(fp);
    for (; filep != NULL; filep = filep->next) {
        addtofilehashlist(i++, ftell(fp));
        fprintf(fp, "%s %d\n", ruleparse(filep->filename), filep->size);
    }
}

/* Prints the list of metaNames into the file index
*/
void printMetaNames(fp)
     FILE *fp;
{
    struct metaEntry *entry;

    offsets[METANAMEPOS] = ftell(fp);
    for (entry = metaEntryList; entry; entry = entry->next)
        fprintf(fp, "%s ", entry->metaName);
    fprintf(fp, "\n");
}

/* Prints the list of file offsets into the index file.
**
** Records the starting position in offsets[FILEOFFSETPOS], then writes
** getfilenum(i) for i = 0, 1, ... as 16-digit zero-padded numbers
** until getfilenum() returns 0, followed by a newline.
*/
void printfileoffsets(fp)
     FILE *fp;
{
    int i;

    offsets[FILEOFFSETPOS] = ftell(fp);
    for (i = 0; getfilenum(i) != 0; i++)
        fprintf(fp, "%016li", getfilenum(i));
    fprintf(fp, "\n");
}

/* Takes a number and prints it to a file using the simple
** accordion scheme of storing numbers.
**
** The number is written as base-128 digits, most significant first,
** with the high bit set on every digit except the last. A single
** 0x80 byte is then written after the digits; for num == 0 that byte
** is the only output.
**
** The trailing byte used to come from reading one element before the
** start of the digit buffer. decompress() discards exactly one byte
** between consecutive numbers, so the byte is part of the file format
** and is now written explicitly (0x80 is the only value that decodes
** as "no contribution").
*/
void compress(num, fp)
     int num;
     FILE *fp;
{
    int i, r;
    static char s[8];

    i = 0;
    while (num) {
        r = num % 128;
        num /= 128;
        s[i++] = r;
    }

    while (i > 0) {
        i--;
        fputc(s[i] | (i ? 128 : 0), fp);
    }
    fputc(128, fp);
}

/* Prints out the decompressed values in an index file.
**
** Header lines (those starting with '#') and the first non-'#' line
** fragment are echoed as text. After that each "word:" is echoed
** and the compressed numbers following it are printed in decimal,
** one entry per line. Once the read position reaches
** offsets[STOPWORDPOS] the remainder of the file is echoed verbatim.
*/
void decompress(fp)
     FILE *fp;
{
    int c, x, inword;
    long pos;
    char line[MAXSTRLEN], header[MAXHEADCHARS + 1];

    readoffsets(fp);
    fseek(fp, 0, 0);

    inword = 1;

    while (1) {
        c = fgetc(fp);
        ungetc(c, fp);
        if (c == '#') {
            fgets(line, MAXSTRLEN, fp);
            printf("%s", line);
            continue;
        } else {
            /* header is only valid if fgets() actually read something */
            if (fgets(header, MAXHEADCHARS + 1, fp) != NULL)
                printf("%s", header);
            break;
        }
    }

    while ((c = fgetc(fp)) != EOF) {
        if (c == ':' && inword) {
            inword = 0;
            putchar(c);
        }
        if (inword) {
            putchar(c);
        } else {
            /* Outside a word the byte just read by the outer loop is
            ** not used; the number is decoded from the bytes after it.
            */
            x = 0;
            do {
                c = fgetc(fp);
                pos = ftell(fp);
                if (pos == offsets[STOPWORDPOS]) {
                    putchar('\n');
                    while (fgets(line, MAXSTRLEN, fp) != NULL)
                        printf("%s", line);
                    return;
                }
                if (c == 0) {
                    /* 0 byte terminates the entry for this word */
                    putchar('\n');
                    inword = 1;
                    break;
                }
                x *= 128;
                x += c & 127;
            } while (c & 128);
            if (x)
                printf(" %d", x);
        }
    }
}

/* Parses lines according to the ReplaceRules directives.
**
** replacelist is read as a flat sequence: a rule name followed by its
** arguments ("replace" takes two, "append" and "prepend" take one).
** Rules are applied to a copy of line in order. Returns line itself
** when there are no rules, otherwise a pointer to a static buffer
** that is overwritten by the next call.
**
** An entry that matches none of the three rule names is skipped
** without touching the result, and processing stops early if a rule
** is missing an argument.
*/
char *ruleparse(line)
     char *line;
{
    char rule[MAXSTRLEN];
    static char tmpline[MAXSTRLEN], newtmpline[MAXSTRLEN];
    static char line1[MAXSTRLEN], line2[MAXSTRLEN];
    struct swline *tmplist;

    if (replacelist == NULL)
        return line;

    tmplist = replacelist;
    snprintf(tmpline, sizeof(tmpline), "%s", line);

    while (1) {
        if (tmplist == NULL)
            return tmpline;
        snprintf(rule, sizeof(rule), "%s", tmplist->line);
        tmplist = tmplist->next;
        if (tmplist == NULL)
            return tmpline;

        if (lstrstr(rule, "replace")) {
            snprintf(line1, sizeof(line1), "%s", tmplist->line);
            tmplist = tmplist->next;
            if (tmplist == NULL)
                return tmpline;     /* second argument is missing */
            snprintf(line2, sizeof(line2), "%s", tmplist->line);
            tmplist = tmplist->next;
            /* Two passes through the NOWORD placeholder.
            ** NOTE(review): the size of the buffer returned by
            ** replace() is not visible here, so these copies are
            ** left as plain strcpy - confirm it fits MAXSTRLEN.
            */
            strcpy(newtmpline, (char *) replace(tmpline, line1, NOWORD));
            strcpy(newtmpline, (char *) replace(newtmpline, NOWORD, line2));
        } else if (lstrstr(rule, "append")) {
            snprintf(newtmpline, sizeof(newtmpline), "%s%s",
                     tmpline, tmplist->line);
            tmplist = tmplist->next;
        } else if (lstrstr(rule, "prepend")) {
            snprintf(newtmpline, sizeof(newtmpline), "%s%s",
                     tmplist->line, tmpline);
            tmplist = tmplist->next;
        } else {
            /* Not a known rule: newtmpline holds nothing for this
            ** line, so leave tmpline as it is.
            */
            continue;
        }

        strcpy(tmpline, newtmpline);
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -