📄 search.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
** Copyright (C) 1995, Enterprise Integration Technologies Corp.        
** All Rights Resereved.
** Kevin Hughes, kevinh@eit.com 
** 3/11/94
**
** Released under the GPL by EIT
**
** Heavily hacked for Harvest
*/

#include "swish.h"
#include "search.h"
#include "file.h"
#include "list.h"
#include "string.h"
#include "hash.h"
#include "merge.h"
#include "mem.h"

/* Terminates the pending word held in word[0..len-1] and, if it is not
** empty, appends it (after convertentities()) to the global searchwordlist.
** Always returns 0 so the caller can reset its pending length in one step.
*/

static int flushsearchword(word, len)
     char *word;
     int len;
{
  if (len) {
    word[len] = '\0';
    searchwordlist = (struct swline *)
      addswline(searchwordlist, (char *) convertentities(word));
  }
  return 0;
}

/* The main search function.
** Parentheses are stripped out, things made lowercase,
** extra blanks removed, etc.
**
** words     - the query text; scanning stops at the first NUL or newline.
** indexlist - list of index file names (one per node, in ->line).
**
** Tokenizing rules:
**   - whitespace ends the current word;
**   - "(" and ")" end the current word and become tokens of their own;
**   - "=" ends the current word and becomes its own token, unless it is
**     directly preceded by '/' inside a word, in which case the '/' is
**     replaced by a literal '=' and the word continues;
**   - every other character is lowercased and appended to the word.
** Words longer than MAXWORDLEN - 1 characters are truncated instead of
** overflowing the local buffer.
**
** Each index file is then opened and searched in turn and the results are
** printed to stdout.  Any error prints an "err:" line and calls exit(0).
*/

void search(words, indexlist)
     char *words;
     struct swline *indexlist;
{
  int i, j, metaName;
  float num;
  char word[MAXWORDLEN];
  struct result *resultlist;
  struct sortresult *sortresultlist;
  struct swline *tmplist;
  FILE *fp;
#ifdef DEBUG
  struct swline *newp2;
#endif

  searchwordlist = NULL;
  metaName = 1;

  for (i = j = 0; words[i] != '\0' && words[i] != '\n'; i++) {
    /* <ctype.h> functions need a value in unsigned char range. */
    unsigned char ch = (unsigned char) words[i];

    if (ch == '=') {
      if (j == 0) {
	searchwordlist = (struct swline *)
	  addswline(searchwordlist, "=");
      }
      else if (words[i - 1] != '/') {
	j = flushsearchword(word, j);
	searchwordlist = (struct swline *)
	  addswline(searchwordlist, "=");
      }
      else {
	/* Needs to erase the '/': overwrite it with the '=' itself. */
	word[j - 1] = tolower(ch);
      }
    }
    else if (isspace(ch) || ch == '(' || ch == ')') {
      j = flushsearchword(word, j);
      if (ch == '(') {
	searchwordlist = (struct swline *)
	  addswline(searchwordlist, "(");
      }
      else if (ch == ')') {
	searchwordlist = (struct swline *)
	  addswline(searchwordlist, ")");
      }
    }
    else if (j < MAXWORDLEN - 1) {
      /* Leave room for the terminating NUL; extra characters are dropped. */
      word[j++] = tolower(ch);
    }
  }
  j = flushsearchword(word, j);

  printf("%s\n", INDEXHEADER);
  if (words[0] == '\0') {
    printf("err: no search words specified\n.\n");
    exit(0);
  }

  printf("# Search words:");
  for (tmplist = searchwordlist; tmplist != NULL; tmplist = tmplist->next)
    printf(" %s", tmplist->line);
  putchar('\n');

  while (indexlist != NULL) {

    commonerror = bigrank = 0;

    if ((fp = fopen(indexlist->line, "r")) == NULL) {
      printf("# Name: unknown index\n");
      printf("err: could not open index file\n.\n");
      exit(0);
    }

    if (!isokindexheader(fp)) {
      printf("err: the index file format is unknown\n.\n");
      exit(0);
    }

    getheader(fp);

    if (!getindexfilenum(fp)) {
      printf("err: the index file is empty\n.\n");
      exit(0);
    }

    readoffsets(fp);
    readstopwords(fp);
    readfileoffsets(fp);
    readMetaNames(fp);

    resultlist = NULL;
    /* fixnot() is defined in this file with a single parameter, so it is
    ** called with exactly one argument.
    */
    tmplist = (struct swline *) fixnot(searchwordlist);
    searchwordlist = (struct swline *) expandstar(tmplist, fp);

#ifdef DEBUG
    newp2 = searchwordlist;
    while (newp2 != NULL) {
      printf("%s ", newp2->line);
      newp2 = newp2->next;
    }
    putchar('\n');
#endif

    resultlist = (struct result *) parseterm(fp, 0, metaName);

    sortresultlist = NULL;
    while (resultlist != NULL) {
      sortresultlist = (struct sortresult *)
	  addsortresult(sortresultlist, resultlist->rank,
			lookupfile(resultlist->filenum, fp));
      resultlist = resultlist->next;
    }

    fclose(fp);

    if (sortresultlist == NULL) {
      if (commonerror)
	printf("err: a word is too common\n");
      else
	printf("err: no results\n");
    }
    else {
      /* Scale factor handed to printsortedresults(): 1000 / bigrank. */
      if (bigrank)
	num = 1000.0 / (float) bigrank;
      else
	num = 1000;
      printsortedresults(sortresultlist, num);
    }

    /* NOTE(review): this restores the fixnot() output rather than the raw
    ** token list, so the next index runs fixnot() on an already
    ** parenthesized list.  Kept as in the original.
    */
    searchwordlist = tmplist;
    indexlist = indexlist->next;

  }

  printf(".\n");
}

/* This puts parentheses in the right places around not structures
** so the parser can do its thing correctly.
** It does it both for 'not' and '='; the '=' is 
used for the METADATA (GH)*/struct swline *fixnot(sp)     struct swline *sp;{	int openparen, hasnot;	int openMeta, hasMeta;	struct swline *tmpp, *newp;#ifdef DEBUG	struct swline *newp2;#endif	tmpp = sp;	newp = NULL;	openparen = 0;	openMeta = 0;	hasMeta = 0;	hasnot = 0;	while (tmpp != NULL) {		if ( ((tmpp->line)[0] == '(') && hasnot)			openparen++;		else if ( ((tmpp->line)[0] == '(') && hasMeta)		        openMeta++;		else if ( ((tmpp->line)[0] == ')') && hasnot)			openparen--;		else if ( ((tmpp->line)[0] == ')') && hasMeta)		        openMeta--;		if (isMetaName(tmpp->next)) {		  /* If it is a metaName add the name and = and skip to next */		  hasMeta = 1;		  newp = (struct swline *) addswline(newp, "(");		  newp = (struct swline *) addswline(newp, tmpp->line);                  newp = (struct swline *) addswline(newp, "=");		  tmpp = tmpp->next;		  tmpp = tmpp->next;		  continue;		}		if (!strcmp(tmpp->line, "not") ) {			hasnot = 1;			newp = (struct swline *) addswline(newp, "(");		}		else if (hasnot && !openparen) {			hasnot = 0;			newp = (struct swline *) addswline(newp, tmpp->line);			newp = (struct swline *) addswline(newp, ")");			tmpp = tmpp->next;			continue;		}		else if (hasMeta && !openMeta) {		        hasMeta = 0;		        newp = (struct swline *) addswline(newp, tmpp->line);		        newp = (struct swline *) addswline(newp, ")");		        tmpp = tmpp->next;		        continue;		}		newp = (struct swline *) addswline(newp, tmpp->line);		if (!strcmp(tmpp->line, "=") ) {		        hasMeta = 1;		        newp = (struct swline *) addswline(newp, "(");		}		tmpp = tmpp->next;	}#ifdef DEBUG	newp2 = newp;	while (newp2 != NULL) {		printf("%s ", newp2->line);		newp2 = newp2->next;	}	putchar('\n');#endif	return newp;}/* Expands words with asterisks as wildcards into a series of** "or" searches. 
Terms like "quick\*" are expanded into** "quicktime or quickly", etc.*/struct swline *expandstar(sp, fp)     struct swline *sp;     FILE *fp;{	int i, firsttime, gotstar;	char foundword[MAXWORDLEN], searchword[MAXWORDLEN];        struct swline *newp;	newp = NULL;	while (sp != NULL) {		strcpy(searchword, sp->line);		if (searchword[0] != '*' && strchr(searchword, '*')) {			for (i = gotstar = 0; searchword[i]; i++)				if (gotstar)					searchword[i] = '\0';				else if (searchword[i] == '*') {					searchword[i] = '\0';					gotstar = 1;				}			firsttime = 0;			do {				strcpy(foundword, getmatchword(searchword,				fp, firsttime));				if (strcmp(foundword, NOWORD)) {				        /* Add "(" if it is the first time */				        if (firsttime == 0) 					        newp = (struct swline *)					             addswline(newp, "(");					if (firsttime)						newp = (struct swline *)						addswline(newp, "or");					newp = (struct swline *)					addswline(newp, foundword);				}				else {					if (!firsttime)						newp = (struct swline *)						addswline(newp, NOWORD);					else  /*Add ")" if last of many */					  newp = (struct swline *)					       addswline(newp, ")");					break;				}				firsttime++;			} while (strcmp(foundword, NOWORD));		}		else {			newp = (struct swline *) addswline(newp,			searchword);		}		sp = sp->next;	}	return newp;}/* If firsttime is 1, returns the first match to a beginnng of a word.** Else if it's 0, returns the next match, until nothing is found,** in which case NULL is returned.*/char *getmatchword(word, fp, firsttime)     char *word;     FILE *fp;     int firsttime;{	int i, c, found;        char *d;	static char fileword[MAXWORDLEN];	if (!firsttime) {		for (i = found = 0; indexchars[i] != '\0'; i++)			if (word[0] == indexchars[i]) {				fseek(fp, offsets[i], 0);				found = 1;			}		if (!found)			return NOWORD;	}	if (offsets[STOPWORDPOS] == ftell(fp))		return NOWORD;        for (i = 0; (c = fgetc(fp)) != 0; ) {                if (c == ':') {                        
fileword[i] = '\0';			i = 0;			while ((c = fgetc(fp)) != 0)				;			if (fileword[0] != word[0])				return NOWORD;			d = (char *) strstr(fileword, word);                        if (d != NULL && d == &fileword[0])                                return fileword;			else {				if (offsets[STOPWORDPOS] == ftell(fp))					return NOWORD;			}                }		else			fileword[i++] = c;	}	return NOWORD;}/* Reads and prints the header of an index file.*/void getheader(fp)     FILE *fp;{	int c;	char line[MAXSTRLEN];	fgets(line, MAXSTRLEN, fp);        while (1) {                c = fgetc(fp);                ungetc(c, fp);                if (c == '#') {                        fgets(line, MAXSTRLEN, fp);                        printf("%s", line);                        continue;                }		else			break;	}	fseek(fp, 0, 0);}/* Reads the offsets in the index file so word lookup is faster.*/void readoffsets(fp)     FILE *fp;{	int c, i, k;	long j, num;	for (i = 0; i < MAXCHARS; i++)		offsets[i] = 0;	fseek(fp, 0, 0);	while (1) {		c = fgetc(fp);		if (c == '#') {			do {				c = fgetc(fp);			} while (c && c != '\n');			continue;		}		else			break;	}        j = 0;        while (c != EOF && c != '\n') {		k = MAXLONGLEN;                for (num = 0; c && isdigit(c) && k--; ) {                        num = (num * 10) + (c - '0');			c = fgetc(fp);		}                offsets[j++] = num;        }}/* Reads the stopwords in the index file.*/void readstopwords(fp)     FILE *fp;{	int i, c;	char word[MAXWORDLEN];	fseek(fp, offsets[STOPWORDPOS], 0);	for (i = 0; (c = fgetc(fp)) != '\n' && c != EOF; )		if (!isspace(c))			word[i++] = c;		else {			word[i] = '\0';			addstophash(word);			i = 0;		}}/* Reads the metaNames from the index*/void readMetaNames(fp)     FILE *fp;{	int i, c;	char word[MAXWORDLEN];	fseek(fp, offsets[METANAMEPOS], 0);	for (i = 0; (c = fgetc(fp)) != '\n' && c != EOF; )		if (!isspace(c))			word[i++] = c;		else {			word[i] = '\0';			metaEntryList = addMetaEntry(metaEntryList, word);			i 
= 0;		}}/* Reads the file offset table in the index file.*/void readfileoffsets(fp)     FILE *fp;{	int j, k, c;	long num;        j = 0;        fseek(fp, offsets[FILEOFFSETPOS], 0);        c = fgetc(fp);        while (c != EOF && c != '\n') {                k = MAXLONGLEN;                for (num = 0; c != EOF && isdigit(c) && k--; ) {                        num = (num * 10) + (c - '0');                        c = fgetc(fp);                }                addtofilehashlist(j++, num);        }}/* The recursive parsing function.** This was a headache to make but ended up being surprisingly easy. :)** parseone tells the function to only operate on one word or term.*/struct result *parseterm(fp, parseone, metaName)     FILE *fp;     int parseone;     int metaName;{	int rulenum;        char word[MAXWORDLEN];	struct result *rp, *newrp;	rp = NULL;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -