📄 localparsers.c
字号:
static char rcsid[] = "localParsers.c,v 1.3 1995/08/04 02:15:28 duane Exp";/* * Contains a parser for Harvest's Summary Object Interchange Format (SOIF) * http://harvest.cs.colorado.edu/. * * There are a few known problems with this parser: * - WAIS parses files line-by-line, so this does not support binary data * - Parsing SOIF attributes is not exact, because the parser needs * to guess on every line with which attribute to associate the data. * - Only a total of MAX_FIELD (256) different attributes are supported. * Any SOIF attribute after MAX_FIELD is ignored (on a first-come, * first-serve basis). * - It's slow, since it needs to test to see if the current line * starts a new attribute or not (worst case is 2 sscanf's per line). * - Only works with SOIF from a Broker (e.g., attribute names are all * lower case). * - All WAIS field names cannot (evidently) contain any * non-alpha-numeric characters. * * However, this parser does work well for most SOIF data... * * Written by Darren Hardy, hardy@cs.colorado.edu, April 1995 */#define localParser_c#include <string.h>#include <ctype.h>#include "localParsers.h"/* define to allow only SOIF attributes that are all lower-case *//* Harvest Brokers only use lower-case attributes */#ifndef SOIF_LOWERCASE_ONLY#define SOIF_LOWERCASE_ONLY#endif/* define for debugging output to stderr */#ifndef SOIF_DEBUG#undef SOIF_DEBUG#endif/* define to strip SOIF attribute names of non-alpha-numeric characters */#ifndef SOIF_STRIP_ATTR#define SOIF_STRIP_ATTR#endif/* * verify_attr() - Returns non-zero if the attr is a valid SOIF attr. */static int soif_verify_attr(char *attr){ char *p; int ok = 0; /* needs to have a (lowercase) alpha character */ for (p = attr; *p; p++) {#ifdef SOIF_LOWERCASE_ONLY if (islower((unsigned char) *p)) {#else if (isalpha((unsigned char) *p)) {#endif ok = 1; break; } } /* needs to be at least 2 characters long */ if (attr[0] == '\0' || attr[1] == '\0') { ok = 0; } return(ok);}/* strips attr of all non-alpha-numeric characters */static void strip_attr(attr)char *attr;{ int i,j; char s[BUFSIZ]; for (i = j = 0; attr[i]; i++) if (isalnum((unsigned char) attr[i])) s[j++] = attr[i]; s[j] = '\0'; strcpy(attr, s);}/* maps attribute name into fieldID */static char *soif_attrtofid[MAX_FIELD+1];/* * grab_fieldID() - Returns the fieldID number for attr; if it's not * registered already, then it returns the next available fieldID */static long soif_grab_attrtofid(char *attr){ long i, min_field; static int initialized = 0; if (!initialized) { initialized = 1; memset(soif_attrtofid, '\0', sizeof(char *) * (MAX_FIELD+1)); } for (i = min_field = MIN_FIELD; i <= MAX_FIELD; i++) { if (soif_attrtofid[i] != NULL) { min_field = i; if (!strcmp(attr, soif_attrtofid[i])) { return (i); } } } if (min_field == MAX_FIELD && soif_attrtofid[MAX_FIELD] != NULL) { return(MIN_FIELD); /* error: too many attributes */ } soif_attrtofid[++min_field] = safeStrdup(attr);#ifdef SOIF_DEBUG fprintf(stderr, "ADDED %d %s\n", min_field, soif_attrtofid[min_field]);#endif return (min_field);}/*---------------------------------------------------------------------------*/fieldInfo*soifField(char* line,long* fieldID){ char attr[8192]; int x, vsize; static long previous_fieldID = MIN_FIELD; attr[0] = '\0'; x = vsize = -1; /* sscanf's are expensive, try quickies first */ /* assumes all attribute names are lowercase as is in Brokers */ if (((line[0] == 'e' && /* quickie */#ifdef SOIF_LOWERCASE_ONLY sscanf(line,"embed<%d>-%[a-z0-9-]{%d}:\t",&x,attr,&vsize)==3)||#else sscanf(line,"embed<%d>-%[A-Za-z0-9-]{%d}:\t",&x,attr,&vsize)==3)||#endif (isalnum((unsigned char) line[0]) && /* quickie, always char or digit */#ifdef SOIF_LOWERCASE_ONLY sscanf(line, "%[a-z0-9-]{%d}:\t", &attr, &vsize) == 2)) &&#else sscanf(line, "%[A-Za-z0-9-]{%d}:\t", &attr, &vsize) == 2)) &&#endif soif_verify_attr(attr)) {#ifndef SOIF_LOWERCASE_ONLY{ /* convert attribute to all lowercase */ char *p; for (p = attr; *p; p++) *p = tolower((unsigned char) *p);}#endif#ifdef SOIF_STRIP_ATTR strip_attr(attr);#endif *fieldID = soif_grab_attrtofid(attr);#ifdef SOIF_DEBUG fprintf(stderr, "GRABBED (embed %d) %d %s\n", x == -1 ?0:1, *fieldID, attr);#endif previous_fieldID = *fieldID; if (fieldIsDefined[*fieldID] == false) { fieldInfo* fd = makeFieldInfo(); fieldIsDefined[*fieldID] = true; fd->fieldID = *fieldID; if (*fieldID == MIN_FIELD) { fd->names = collecting(fd->names,safeStrdup("body")); fd->description = safeStrdup("body"); } else { fd->names = collecting(fd->names,safeStrdup(attr)); fd->description = safeStrdup(attr); } fd->fieldType = TEXT_FIELD; fd->lowerBoundSet = false; fd->upperBoundSet = false; return(fd); } return(NULL); } *fieldID = previous_fieldID; /* associate data with prev attr */ return(NULL);}/*---------------------------------------------------------------------------*/longsoifGatherHeadline(char* line){ char *s; /* this is called on every line so be quick about it... */ if (line[0] == '@' && theHeadline[0] == '\0') { /* save away the URL */ if ((s = strchr(line, '{')) == NULL) return IN_BODY; else s++; while (isspace((unsigned char) *s)) s++; strncpy(theHeadline,s,MAX_HEADLINE_LEN); return(IN_HEADLINE); } return IN_BODY;}/*---------------------------------------------------------------------------*/void soifFinishHeadline(char* headline,char* filename){ char *s; if (theHeadline[0]) { /* Use the OBJxxx first in the headline */ if ((s = strrchr(filename, '/')) == NULL) { s = filename; } else { s++; } /* Headline is ``OBJxxx URL'' */ strncpy(headline,s,MAX_HEADLINE_LEN); strcat(headline, " "); strncat(headline,theHeadline,MAX_HEADLINE_LEN-strlen(s)-2); theHeadline[0] = '\0'; } else { strcpy(headline,"Unknown Headline"); }}/*---------------------------------------------------------------------------*/parserInfo localParserList[] = { defParser("soif", "Harvest's Summary Object Interchange Format", "TEXT", true, /* true to index contents, false otherwise */ NULL, soifField, soifGatherHeadline, NULL, NULL, NULL, NULL, soifFinishHeadline, NULL ), defParser(NULL, NULL, NULL, false, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL )};/* The localParserList is a null-terminated list of defParser structures; DO NOT DELETE the NULL structure at the end! *//*---------------------------------------------------------------------------*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -