⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 localparsers.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "localParsers.c,v 1.3 1995/08/04 02:15:28 duane Exp";/* *  Contains a parser for Harvest's Summary Object Interchange Format (SOIF) *  http://harvest.cs.colorado.edu/. * *  There are a few known problems with this parser: *    - WAIS parses files line-by-line, so this does not support binary data *    - Parsing SOIF attributes is not exact, because the parser needs  *      to guess on every line with which attribute to associate the data. *    - Only a total of MAX_FIELD (256) different attributes are supported. *      Any SOIF attribute after MAX_FIELD is ignored (on a first-come, *      first-serve basis). *    - It's slow, since it needs to test to see if the current line *      starts a new attribute or not (worst case is 2 sscanf's per line). *    - Only works with SOIF from a Broker (e.g., attribute names are all *      lower case). *    - All WAIS field names cannot (evidently) contain any *      non-alpha-numeric characters. * *  However, this parser does work well for most SOIF data... * *  Written by Darren Hardy, hardy@cs.colorado.edu, April 1995 */#define localParser_c#include <string.h>#include <ctype.h>#include "localParsers.h"/* define to allow only SOIF attributes that are all lower-case *//* Harvest Brokers only use lower-case attributes */#ifndef SOIF_LOWERCASE_ONLY#define SOIF_LOWERCASE_ONLY#endif/* define for debugging output to stderr */#ifndef SOIF_DEBUG#undef SOIF_DEBUG#endif/* define to strip SOIF attribute names of non-alpha-numeric characters */#ifndef SOIF_STRIP_ATTR#define SOIF_STRIP_ATTR#endif/* *  verify_attr() - Returns non-zero if the attr is a valid SOIF attr. 
*/static int soif_verify_attr(char *attr){  char *p;  int ok = 0;  /* needs to have a (lowercase) alpha character */  for (p = attr; *p; p++) {#ifdef SOIF_LOWERCASE_ONLY    if (islower((unsigned char) *p)) {#else    if (isalpha((unsigned char) *p)) {#endif      ok = 1;      break;    }  }  /* needs to be at least 2 characters long */  if (attr[0] == '\0' || attr[1] == '\0') {    ok = 0;  }  return(ok);}/* strips attr of all non-alpha-numeric characters */static void strip_attr(attr)char *attr;{	int i,j;	char s[BUFSIZ];	for (i = j = 0; attr[i]; i++) 		if (isalnum((unsigned char) attr[i])) 			s[j++] = attr[i];	s[j] = '\0';	strcpy(attr, s);}/* maps attribute name into fieldID */static char *soif_attrtofid[MAX_FIELD+1];/* *  grab_fieldID() - Returns the fieldID number for attr;  if it's not *  registered already, then it returns the next available fieldID */static long soif_grab_attrtofid(char *attr){   long i, min_field;   static int initialized = 0;   if (!initialized) {     initialized = 1;     memset(soif_attrtofid, '\0', sizeof(char *) * (MAX_FIELD+1));   }   for (i = min_field = MIN_FIELD; i <= MAX_FIELD; i++) {     if (soif_attrtofid[i] != NULL) {       min_field = i;       if (!strcmp(attr, soif_attrtofid[i])) {         return (i);       }     }   }   if (min_field == MAX_FIELD && soif_attrtofid[MAX_FIELD] != NULL) {     return(MIN_FIELD);	/* error: too many attributes */   }   soif_attrtofid[++min_field] = safeStrdup(attr);#ifdef SOIF_DEBUG  fprintf(stderr, "ADDED %d %s\n", min_field, soif_attrtofid[min_field]);#endif   return (min_field);}/*---------------------------------------------------------------------------*/fieldInfo*soifField(char* line,long* fieldID){	char attr[8192];	int x, vsize;	static long previous_fieldID = MIN_FIELD;	attr[0] = '\0';	x = vsize = -1;	/* sscanf's are expensive, try quickies first */	/* assumes all attribute names are lowercase as is in Brokers */	if (((line[0] == 'e' && /* quickie */#ifdef SOIF_LOWERCASE_ONLY              
sscanf(line,"embed<%d>-%[a-z0-9-]{%d}:\t",&x,attr,&vsize)==3)||#else              sscanf(line,"embed<%d>-%[A-Za-z0-9-]{%d}:\t",&x,attr,&vsize)==3)||#endif              (isalnum((unsigned char) line[0]) && /* quickie, always char or digit */#ifdef SOIF_LOWERCASE_ONLY              sscanf(line, "%[a-z0-9-]{%d}:\t", &attr, &vsize) == 2)) &&#else              sscanf(line, "%[A-Za-z0-9-]{%d}:\t", &attr, &vsize) == 2)) &&#endif              soif_verify_attr(attr)) {#ifndef SOIF_LOWERCASE_ONLY{		/* convert attribute to all lowercase */		char *p;		for (p = attr; *p; p++)			*p = tolower((unsigned char) *p);}#endif#ifdef SOIF_STRIP_ATTR		strip_attr(attr);#endif		*fieldID = soif_grab_attrtofid(attr);#ifdef SOIF_DEBUG  fprintf(stderr, "GRABBED (embed %d) %d %s\n", x == -1 ?0:1, *fieldID, attr);#endif		previous_fieldID = *fieldID;   		if (fieldIsDefined[*fieldID] == false) {         		fieldInfo* fd = makeFieldInfo();        		fieldIsDefined[*fieldID] = true;        		fd->fieldID = *fieldID;                        if (*fieldID == MIN_FIELD) {        		  fd->names = collecting(fd->names,safeStrdup("body"));        		  fd->description = safeStrdup("body");			} else {        		  fd->names = collecting(fd->names,safeStrdup(attr));        		  fd->description = safeStrdup(attr);			}        		fd->fieldType = TEXT_FIELD;        		fd->lowerBoundSet = false;        		fd->upperBoundSet = false;        		return(fd);      		}                return(NULL);   	} 	*fieldID = previous_fieldID;	/* associate data with prev attr */   	return(NULL);}/*---------------------------------------------------------------------------*/longsoifGatherHeadline(char* line){  char *s;  /* this is called on every line so be quick about it... 
*/  if (line[0] == '@' && theHeadline[0] == '\0') {     /* save away the URL */     if ((s = strchr(line, '{')) == NULL)       return IN_BODY;     else       s++;     while (isspace((unsigned char) *s))       s++;     strncpy(theHeadline,s,MAX_HEADLINE_LEN);     return(IN_HEADLINE);  }   return IN_BODY;}/*---------------------------------------------------------------------------*/void soifFinishHeadline(char* headline,char* filename){  char *s;  if (theHeadline[0]) {    /* Use the OBJxxx first in the headline */    if ((s = strrchr(filename, '/')) == NULL) {      s = filename;    } else {      s++;    }    /* Headline is ``OBJxxx URL'' */    strncpy(headline,s,MAX_HEADLINE_LEN);    strcat(headline, " ");    strncat(headline,theHeadline,MAX_HEADLINE_LEN-strlen(s)-2);    theHeadline[0] = '\0';  } else {    strcpy(headline,"Unknown Headline");  }}/*---------------------------------------------------------------------------*/parserInfo localParserList[] = {  defParser("soif",            "Harvest's Summary Object Interchange Format",            "TEXT",            true, /* true to index contents, false otherwise */            NULL,            soifField,            soifGatherHeadline,            NULL,            NULL,            NULL,            NULL,            soifFinishHeadline,            NULL           ),  defParser(NULL,            NULL,            NULL,            false,            NULL,            NULL,            NULL,            NULL,            NULL,            NULL,            NULL,            NULL,            NULL           )};/*  The localParserList is a null-terminated list of defParser structures;    DO NOT DELETE the NULL structure at the end! *//*---------------------------------------------------------------------------*/

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -