📄 build_in.c

📁 harvest是一个下载HTML网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. *//* ./glimpse/index/build_in.c *//* --------------------------------------------------------------    build_index():  build an index list from a set of files.    INPUT: a set of file names	   char **name_list[];	   a partition table	   int p_table[];    OUTPUT: an index list;	   char *index_list;	   the index list is a char string as follows:	   each entry of the index list contains two parts:	   name and indices, where name is an ascii character string,           and indices is a list of short integer. (unsigned char)           We use newline as a 'record delimiter' (a 'record is logically	   a word associated with its indices), and WORD_END_MARK to separate	   a word from its list of indices (s.t. fscanf %s works).	   Since we restrict the max number of partitions to be 255.	   a byte is enough to represent the index value. Note that there	   cannot be a partition #ed '\n'.	   An example index list: (in logical view)           this 12 19 \n is 9 17 12 18 19 \n an 7 12 \n example 16 \n-----------------------------------------------------------------------*/#include "glimpse.h"#define debugt#define BINARY 1/* #define SW_DEBUG  the original sw output of index set *//* This flag must always be defined: it is used only in build_in.c *//* #define UDI_DEBUG  the original outputs of each indexed file *//* Some variables used throughout */#if	BG_DEBUGextern FILE  *LOGFILE; 	/* file descriptor for LOG output */#endif	/*BG_DEBUG*/extern FILE  *STATFILE;	/* file descriptor for statistical data about indexed files */extern FILE  *MESSAGEFILE;	/* file descriptor for important messages meant for the user */extern char  INDEX_DIR[MAX_LINE_LEN];extern char  sync_path[MAX_LINE_LEN];extern struct stat istbuf;extern struct stat excstbuf;extern struct stat incstbuf;void insert_h();void insert_index();extern int ICurrentFileOffset;extern int NextICurrentFileOffset;/* Some options used throughout */extern int 
OneFilePerBlock;extern int IndexNumber;extern int CountWords;extern int StructuredIndex;extern int InterpretSpecial;extern int total_size;extern int MAXWORDSPERFILE;extern int NUMERICWORDPERCENT;extern int AddToIndex;extern int DeleteFromIndex;extern int FastIndex;extern int BuildDictionary;extern int BuildDictionaryExisting;extern int CompressAfterBuild;extern int IncludeHigherPriority;extern int FilenamesOnStdin;extern int UseFilters;extern int ByteLevelIndex;extern int RecordLevelIndex;extern int StoreByteOffset;extern int rdelim_len;extern char rdelim[MAX_LINE_LEN];extern char old_rdelim[MAX_LINE_LEN];/* int IndexUnderscore; */extern int IndexableFile;extern int MAX_INDEX_PERCENT;extern int MAX_PER_MB;extern int I_THRESHOLD;extern int usemalloc;extern int BigHashTable;extern int AddedMaxWordsMessage;extern int AddedMixedWordsMessage;extern int  icount; /* count the number of my_malloc for indices structure */extern int  hash_icount; /* to see how much was added to the current hash table */extern int  save_icount; /* to see how much was added to the index by the current file */extern int  numeric_icount; /* to see how many numeric words were there in the current file */extern int num_filter;extern int filter_len[MAX_FILTER];extern CHAR *filter[MAX_FILTER];extern CHAR *filter_command[MAX_FILTER];extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE;extern int mask_int[32];struct indices	*deletedlist = NULL;char **name_list[MAXNUM_INDIRECT];unsigned int *disable_list = NULL;int *size_list[MAXNUM_INDIRECT];	/* temporary area to store size of each file */extern int  p_table[MAX_PARTITION];int  p_size_list[MAX_PARTITION];	/* sum of the sizes of the files in each partition */int part_num;   /* number of partitions */extern int memory_usage;/* borrowd from getword.c */extern int PrintedLongWordWarning;extern int indexable_char[256];extern char *getword();extern int file_num;extern int old_file_num;extern int attr_num;extern int  bp;                    
      /* buffer pointer */extern unsigned char word[MAX_WORD_BUF];extern int FirstTraverse1;extern struct  indices *ip;extern int HashTableSize;struct token **hash_table; /*[MAX_64K_HASH];*/build_index(){	int	i;	if (AddToIndex || FastIndex) {		FirstTraverse1 = OFF;	}	if ((total_size < LIMIT_64K_HASH*1024*1024) || !BigHashTable) {		hash_table = (struct token **)my_malloc(sizeof(struct token *) * MAX_64K_HASH);		HashTableSize = MAX_64K_HASH;	}	else {		hash_table = (struct token **)my_malloc(sizeof(struct token *) * MAX_256K_HASH);		HashTableSize = MAX_256K_HASH;	}        build_hash();        /* traverse1(); ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ removed on oct/8/96, bgopal, to see if crazysegvs disappear on lec */        return;}/* ----------------------------------------------------------------------traverse()function: traverse the hash list of indices = a hash list is a array oflinked list, where every node in a linked list contains a word whosehash_value is the same.While traversing the hash list, traverse() output a stream of index list.It also frees the memory used in hash_table.------------------------------------------------------------------------*/#define CRAZYSEGV	0traverse(){    int numseencount = 0;    int numelements;    int numonline;    int  i, j, attribute;    struct token *tp, *tp_old;    struct indices *ip, *ip_old;#if	!CRAZYSEGV    FILE   *f_out;#else    unsigned char onechar[4];    unsigned char onestring[MAX_LINE_LEN];    int	f_out;#endif    char   s[MAX_LINE_LEN];    char   *word;    int	x = -1, y=0, diff, temp, even_words=1;	/* 0 is an even number */    int fputcerr; /* added by dgh 5-8-96 */#ifdef	SW_DEBUG    printf("in traverse()\n");#endif    sprintf(s, "%s/%s", INDEX_DIR, I2);#if	!CRAZYSEGV    if ((f_out = fopen(s, "w")) == NULL) {#else    if ((f_out = open(s, O_WRONLY|O_CREAT|O_TRUNC, 0600)) == -1) {#endif	fprintf(stderr, "Cannot open %s for writing\n", s);	exit(2);    }    for(i=0; i<HashTableSize; i++) {        if(hash_table[i] == NULL) 
continue;        tp = hash_table[i];        tp_old = tp;        while(tp != NULL) {   /* traverse the token list */	    word = tp->word;            while(*word != '\0') {  /* copy the word to output */#if	!CRAZYSEGV		fputcerr=fputc(*word++, f_out);/* change from putc to fputc */				       /* by dgh, 8-5-96 */#else		write(f_out, word, 1);		word++;#endif            }	    /* Look for stop lists */	    if (OneFilePerBlock && !ByteLevelIndex && (file_num > MaxNum8bPartition) && (tp->totalcount > (file_num * MAX_INDEX_PERCENT / 100))) {#if	!CRAZYSEGV		putc(ALL_INDEX_MARK, f_out);#else		onechar[0] = ALL_INDEX_MARK;		write(f_out, onechar, 1);#endif		if (StructuredIndex) {  /* force big-endian as usual */		    attribute = encode16b(tp->attribute);#if	!CRAZYSEGV		    putc((attribute&0x0000ff00)>>8, f_out);		    putc((attribute&0x000000ff), f_out);#else		    onechar[0] = (attribute&0x0000ff00)>>8;		    onechar[1] = (attribute&0x000000ff);		    write(f_out, onechar, 2);#endif		}#if	!CRAZYSEGV		putc(DONT_CONFUSE_SORT, f_out);#else		onechar[0] = DONT_CONFUSE_SORT;		write(f_out, onechar, 1);#endif		goto next_token;	    }	    else if (ByteLevelIndex && (tp->totalcount > ( (((total_size>>20) > 0) && ((total_size>>20)*MAX_PER_MB < MAX_ALL_INDEX)) ? 
((total_size>>20) * MAX_PER_MB) : MAX_ALL_INDEX) )) {#if	!CRAZYSEGV		putc(ALL_INDEX_MARK, f_out);#else		onechar[0] = ALL_INDEX_MARK;		write(f_out, onechar, 1);#endif		if (StructuredIndex) {  /* force big-endian as usual */		    attribute = encode16b(tp->attribute);#if	!CRAZYSEGV		    putc((attribute&0x0000ff00)>>8, f_out);		    putc((attribute&0x000000ff), f_out);#else		    onechar[0] = (attribute&0x0000ff00)>>8;		    onechar[1] = (attribute&0x000000ff);		    write(f_out, onechar, 2);#endif		}#if	!CRAZYSEGV		putc(DONT_CONFUSE_SORT, f_out);#else		onechar[0] = DONT_CONFUSE_SORT;		write(f_out, onechar, 2);#endif		goto next_token;	    }#if	!CRAZYSEGV	    putc(WORD_END_MARK, f_out);#else	    onechar[0] = WORD_END_MARK;	    write(f_out, onechar, 1);#endif	    if (StructuredIndex) {  /* force big-endian as usual */		attribute = encode16b(tp->attribute);#if	!CRAZYSEGV		putc((attribute&0x0000ff00)>>8, f_out);		putc((attribute&0x000000ff), f_out);#else		    onechar[0] = (attribute&0x0000ff00)>>8;		    onechar[1] = (attribute&0x000000ff);		    write(f_out, onechar, 2);#endif	    }	    numonline = 0;	    x = -1;	    y = 0;	    even_words = 1;	    ip = tp->ip;	/* traverse the indices list */            ip_old = ip;	    numelements = 0;            while(ip != NULL) {		numelements ++;		if (CountWords) {#if	!CRAZYSEGV		    fprintf(f_out, "%d", ip->offset[0]);#else		    sprintf(onestring, "%d", ip->offset[0]);		    write(f_out, onestring, strlen(onestring));#endif		}		else {		    if (ByteLevelIndex) {			for (j=0; j < INDEX_SET_SIZE; j++) {			    if (ip->index[j] == INDEX_ELEM_FREE) continue;			    if ((ip->offset[j] <= y) && (y > 0) && (x == ip->index[j])) {	/* consecutive offsets not increasing in same file! */				fprintf(stderr, "ignoring (%d, %d) > (%d, %d)\n", x, y, ip->index[j], ip->offset[j]);				continue;	/* error! */			    }			    if (numonline >= MAX_PER_LINE) {				/* terminate current line since it is too late to put ALL_INDEX_MARK now ... 
Unfortunate since sort is screwedup */#if	!CRAZYSEGV				putc('\n', f_out);#else			        onechar[0] = '\n';			        write(f_out, onechar, 1);#endif#if	0				putc('\n', stdout);#endif	/*0*/				word = tp->word;				while(*word != '\0') {  /* copy the word to output */#if	!CRAZYSEGV				    putc(*word++, f_out);#else				    write(f_out, word, 1);				    word ++;#endif				}#if	!CRAZYSEGV				putc(WORD_END_MARK, f_out);#else			        onechar[0] = WORD_END_MARK;			        write(f_out, onechar, 1);#endif				if (StructuredIndex) {  /* force big-endian as usual */				    attribute = encode16b(tp->attribute);#if	!CRAZYSEGV				    putc((attribute&0x0000ff00)>>8, f_out);				    putc((attribute&0x000000ff), f_out);#else				    onechar[0] = (attribute&0x0000ff00)>>8;				    onechar[1] = (attribute&0x000000ff);				    write(f_out, onechar, 2);#endif				}				numonline = 0;				x = -1;	/* to force code below to output it as if it is a fresh file */				y = 0;	/* must output first offset as is, rather than difference */			    }			    if (x != ip->index[j]) {				if (x != -1) {				    temp = encode8b(0);#if	!CRAZYSEGV				    putc(temp, f_out);	/* can never ordinarily happen since ICurrentFileOffset is always ++d => delimiter (unless RecordLevelIndex) */#else				    onechar[0] = temp;				    write(f_out, onechar, 1);#endif				}				if (file_num <= MaxNum8bPartition) {				    x = encode8b(ip->index[j]);#if	!CRAZYSEGV				    putc(x&0x000000ff, f_out);#else				    onechar[0] = x&0x000000ff;				    write(f_out, onechar, 1);#endif				}				else if (file_num <= MaxNum16bPartition) {				    x = encode16b(ip->index[j]);#if	!CRAZYSEGV				    putc((x&0x0000ff00)>>8, f_out);				    putc(x&0x000000ff, f_out);#else				    onechar[0] = (x&0x0000ff00)>>8;				    onechar[1] = x&0x000000ff;				    write(f_out, onechar, 2);#endif				}				else {				    x = encode24b(ip->index[j]);#if	!CRAZYSEGV				    putc((x&0x00ff0000)>>16, f_out);				    putc((x&0x0000ff00)>>8, f_out);				    putc(x&0x000000ff, 
f_out);#else				    onechar[0] = (x&0x00ff0000)>>16;
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -