📄 glimpse.h

📁 harvest是一个下载html网页的机器人
💻 H
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. *//* ./glimpse/index/pirs.h */#ifndef _GLIMPSE_H_#define _GLIMPSE_H_#include <stdio.h>#include <stdlib.h>#include <string.h>#include <sys/types.h>/*#include <sys/uio.h>*/#include <sys/types.h>#include <sys/stat.h>#include <unistd.h>#include <fcntl.h>#undef log#include "agrep.h"#ifndef S_ISREG/* #define S_ISREG(mode) (0100000&(mode)) */#define S_ISREG(mode)   (((mode) & (_S_IFMT)) == (_S_IFREG))#endif#ifndef S_ISDIR/* #define S_ISDIR(mode) (0040000&(mode)) */#define S_ISDIR(mode)   (((mode) & (_S_IFMT)) == (_S_IFDIR))#endif#define IC_PORTRELEASE	20	/* time till used TCP port is released */#ifndef ON#define ON		1#endif#ifndef OFF#define OFF		0#endif#ifndef CHAR#define CHAR 		unsigned char#endif#define MAX_INCLUSIVE	256	/* max number of inclusive patterns for				   files to be indexed even if filetype.c				   says otherwise. */#define MAX_EXCLUSIVE	256 	/* max number of exclusive patterns 				   for not_to_be_indexed files  */#define MAX_FILTER	256	/* max number of filter patterns */#define DEF_I_THRESHOLD	40000	/* 100000 originally, debugging 10000 */#define AVG_OCCURRENCES	8	/* #of places a word occurs on average: sizeof(.glimpse_partitions)/`wc -l .glimpse_index`: divisible by INDEX_SET_SIZE */#define MAX_LIST	0177777#define DEFAULT_PART_SIZE (1 << 13)#define MAX_64K_HASH	(64*1024)#define MAX_256K_HASH	(256*1024)#define MAX_4K_HASH	(4*1024)#define DISKBLOCKSIZE	8192#define BLOCK_SIZE	(1024*64)#define MAX_PARTITION	255#define MaxNumPartition	250     /* it's not 255, since there is fragmentation*//* The idea behind our encoding is: dividend = divisor * quotient + remainder */#define MaxNum4bPartition (16 -  2)	/* since 10 and 0 can't be in LSB/MSB */#define MaxNum8bPartition (256 - 2)#define MaxNum12bPartition (MaxNum4bPartition*MaxNum8bPartition)#define MaxNum16bPartition (MaxNum8bPartition*MaxNum8bPartition)#define MaxNum24bPartition (MaxNum8bPartition*MaxNum16bPartition)#define 
MaxNum32bPartition (MaxNum8bPartition*MaxNum24bPartition)/* These help in encoding byte-level indices: 1st byte's top 2 bits tell the #of bytes - 1 in offset-difference encoding; offset-diff 0 => new file follows */#define MaxNum1BPartition (MaxNum8bPartition & 0x3f)			/* 62: top byte is 0x00 | x % MaxNum8bPartition === x; just encode x */#define MaxNum2BPartition (MaxNum1BPartition * MaxNum8bPartition)	/* top byte = 0x40 | x / MaxNum8bPartition; rest is x % ~; encode both separately */#define MaxNum3BPartition (MaxNum1BPartition * MaxNum16bPartition)	/* top byte = 0x80 | x / MaxNum16bPartition; rest is x % ~; encode both separately */#define MaxNum4BPartition (MaxNum1BPartition * MaxNum24bPartition)	/* top byte = 0xc0 | x / MaxNum24bPartition; rest is x % ~; encode both separately */#define DEF_NUMERIC_WORD_PERCENT 50	/* warn if > this many % of words added by file are numeric */#define MIN_WORDS		50	/* before we inform about numeric words */#define MAX_SEARCH_PERCENT	20	/* warn user if searching > this % of blocks */#define DEF_MAX_INDEX_PERCENT	80	/* if word in > 80%, say everywhere for one-file-per-block */#define DONT_CONFUSE_SORT 1#define WORD_END_MARK 	2#define ALL_INDEX_MARK	3		/* If this, then word is in > 60% of blocks */#define ATTR_END_MARK	4		/* After list of attributes before file offset/block numbers */#define FILE_END_MARK	' '		/* If InfoAfterFilename, end filename with this 					* Define as other than space to be able to index 					*   filenames with spaces in them.  Works with '\t'.					
*  If using webglimpse, must match settings in 					*  makenh and webglimpse */#define AVG_WORD_LEN	12		/* average word length is 8-9 including '\0': have safety margin */#define MAX_NAME_SIZE   256#define MAX_NAME_LEN	MAX_NAME_SIZE#define MaxNameLength	MAX_NAME_SIZE#define MAX_LINE_SIZE	1024#define MAX_LINE_LEN	1024#define MAX_SORTLINE_LEN (MAX_LINE_LEN * 16)	/* Can be ((MaxNum16bPartition*sizeof(int)+MAX_NAME_LEN)*MAX_INDEX_PERCENT/100) in the worst case */#define MAX_NAME_BUF	MAX_NAME_SIZE#define MAX_WORD_SIZE	64	/* w/o '\0'; was 24 in 2.1 */#define MAX_WORD_LEN	MAX_WORD_SIZE#define MAX_WORD_BUF	80	/* was 32 in 2.1 */#define MAX_PAT		256  #define MAXNUM_INDIRECT	MaxNum8bPartition#define MAX_INDEX_BUF	(MAX_PARTITION + 1 + 2*MAX_WORD_BUF + 2)	/* index line length without OneFilePerBlock */#define DEF_REAL_INDEX_BUF	(MaxNum16bPartition  + 2*MAX_WORD_BUF + 2)	/* index line length with OneFilePerBlock *//* Must write fresh code to calculate these sets based by multiplying defaults below with round(file_num, MaxNum16bPartition) */#define DEF_FILESET_SIZE	MaxNum16bPartition	/* used when OneFilePerBlock is ON */#define DEF_FILEMASK_SIZE	(DEF_FILESET_SIZE/(8*sizeof(int)) + 4)	/* bit mask of files */#define DEF_REAL_PARTITION	(DEF_FILEMASK_SIZE + 4)	/* must be > MAX_PARTITION + 1 *//* block must be in 0..DEF_FILESET_SIZE-1, and integers should represent bit-masks */#define block2index(i)	(i/(8*sizeof(int)))#define block2mask(i)	(1<<(i%(8*sizeof(int))))	/* not used */#define round(x, y)	(((x)+(y)-1)/(y))#define FILES_PER_PARTITION(x)	(16 + round(x, MAX_PARTITION)*16)	/* 16 is minimum length of buffer: thereafter, allow noise upto 16 times average */#define LIST_GET(list, elem) ((list[(elem)/MaxNum16bPartition] == 0) ? 
(0) : (list[(elem)/MaxNum16bPartition][(elem)%MaxNum16bPartition]))#define LIST_SUREGET(list, elem) (list[(elem)/MaxNum16bPartition][(elem)%MaxNum16bPartition])#define LIST_ADD(list, elem, what, type) \{\	int	index = (elem /*+ 1*/)/MaxNum16bPartition;\	if (list[index] == NULL) {\		list[index] = (type *)malloc(sizeof(type)*MaxNum16bPartition);\		memset(list[index], '\0', sizeof(type)*MaxNum16bPartition);\	}\	LIST_SUREGET(list, elem) = what;\}#define DEFAULT_REGION_LIMIT 256	/* default limit for a record: for ByteLevelIndex: pattern is ignored since can't avoid false matches w/o search */#define MAX_REGION_LIMIT Max_record	/* max amount of space I am going allocate for a record bounded by a delimiter: was 16384! Fixed -bg */#define MAX_PER_LINE	(MAX_SORTLINE_LEN / 2)	/* #of words that can occur on one line before we split it up: not implemented at present */#define DEF_MAX_PER_MB	500	/* Maximum number of times a word should occur in a megabyte before we say its everywhere */#define DEF_ALL_INDEX	10000	/* Must be < DEF_MAX_ALL_INDEX */#define DEF_MAX_ALL_INDEX	(DEF_REAL_INDEX_BUF / 2)	/* THIS * 2 must be < DEF_REAL_INDEX_BUF to prevent seg-faults! 
*//* Default file names */#define FILTER_FILE	".glimpse_filters"#define ATTRIBUTE_FILE	".glimpse_attributes"#define INDEX_FILE	".glimpse_index"#define MINI_FILE	".glimpse_turbo"#define P_TABLE		".glimpse_partitions"#define NAME_LIST	".glimpse_filenames"#define NAME_LIST_INDEX	".glimpse_filenames_index"#define NAME_HASH	".glimpse_filehash"#define NAME_HASH_INDEX	".glimpse_filehash_index"#define DEF_TIME_FILE	".glimpse_filetimes"#define DEF_LOG_FILE	".glimpse_log"#define DEF_MESSAGE_FILE ".glimpse_messages"#define DEF_STAT_FILE	".glimpse_statistics"#define PROHIBIT_LIST	".glimpse_exclude"#define INCLUDE_LIST	".glimpse_include"#define DEBUG_FILE	".glimpse_debug"#define I2		".glimpse_tmpi2"#define I3		".glimpse_tmpi3"#define I1		".glimpse_tmpi1"#define O1		".glimpse_tmpo1"#define O2		".glimpse_tmpo2"#define O3		".glimpse_tmpo3"#define DEF_LOCK_FILE	".glimpse_lock"#define HARVEST_PREFIX	"glimpse"	/* so that Darren can filterout error messages a user should see from the stuff outputted by glimpse on an error */#define MASK_INT \{ 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040, 0x00000080,\  0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000, 0x00004000, 0x00008000,\  0x00010000, 0x00020000, 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000,\  0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000\}#define INDEXABLE(c)	(indexable_char[c])#if	SFS_COMPAT#define IGNORED_SUFFIXES {".glimpse_filehash", ".glimpse_filehash.prev", ".glimpse_filehash_index", ".glimpse_filehash_index.prev", ".glimpse_filenames", ".glimpse_filenames.prev", ".glimpse_filenames_index", ".glimpse_filenames_index.prev", ".glimpse_filetimes", ".glimpse_index", ".glimpse_partitions", ".glimpse_statistics", ".glimpse_messages", ".glimpse_exclude", ".glimpse_include", ".glimpse_filters", ".glimpse_attributes", ".glimpse_turbo"}#define NUM_SUFFIXES	18#else#define IGNORED_SUFFIXES {"gz", "Z", 
"z", "zip", "o", "hqx", "tar", "glimpse_times", "glimpse_index", "glimpse_partitions"}#define NUM_SUFFIXES	10#endif#define EXTRACT_INFO_SUFFIX {".htm", ".html", ".shtm", ".shtml"}#define NUM_EXTRACT_INFO_SUFFIX	4/* Version and release year: same for glimpse and glimspeindex since glimpse HAS to interpret glimpseindex */#define GLIMPSE_VERSION	"4.12"#define GLIMPSE_DATE	"1999"#define GLIMPSE_EMAIL	"glimpse@cs.arizona.edu"#define GLIMPSE_URL	"http://glimpse.cs.arizona.edu/"/* Some extern functions used in structured queries */extern int attr_name_to_id(), attr_load_names(), attr_dump_names();extern char *attr_id_to_name();/* Data structures for hash-tables in build_in.c */struct  token {			/* each token stores a unique word and unique attribute */	        struct token *next_t;	/* keep it a pointer even with tokenalloc to keep build_in.c same */	char *word; 				struct indices *ip;	/* points to the head of the list of indices */	struct indices *lastip;	/* tail of this list = last elemet (for increasing order insertion) */	unsigned int attribute;	unsigned int totalcount;/* no. of indices structures in a token */        };#define INDEX_SET_SIZE	4#define INDEX_ELEM_FREE	(MaxNum24bPartition + 1)	/* can never be equal to a partition value */struct indices {	struct indices *next_i;	/* keep it a pointer even with indexalloc to keep build_in.c same */        /*unsigned*/ int index[INDEX_SET_SIZE]; 	/* changed from char, 31/3/94 */	/*unsigned*/ int offset[INDEX_SET_SIZE];	/* added 19/9/94 */	};/* Added 20/9/94 for get_index.c in glimpse (make it more efficient in space later) */struct offsets {	struct offsets *next;	int offset;	/* NOT unsigned!!! */	short	sign;	/* if 0, then indeterminate (bothways), 1 then +ve, -1 then -ve */	short	done;	/* if 0, then this did not have an intersection now, else it has had it */	};#define INDICES_PER_TOKEN	(AVG_OCCURRENCES/INDEX_SET_SIZE)	/* average no. 
of struct indices per struct token: purely empirical result :-) *//* Memory allocators: in io.c */extern char *my_malloc();extern int my_free();extern FILE *my_fopen();extern int my_open(), my_stat(), my_lstat();extern char *wordalloc();extern int wordfree();extern int allwordfree();extern struct indices *indicesalloc();extern int indicesfree();extern int allindicesfree();extern struct token *tokenalloc();extern int tokenfree();extern int alltokenfree();#define LIMIT_64K_HASH	50	/* size of total stuff to be indexed in MB after which 256K hash tables make more sense with the -B option */#define hashword(word, wordlen)	(((total_size < LIMIT_64K_HASH*1024*1024) || !BigHashTable) ? (hash64k(word, wordlen)) : (hash256k(word, wordlen)));/* * Just stores the word, wordlength and offset present in a line of the index in a structure (when made with -o or -b). * Doesn't store the attribute since we just need a hint into .glimpse_index from where agrep should begin search. */#define	WORD_SORTED	0#if	WORD_SORTEDstruct mini {	char	*word;	long	offset;};/* Region searched with strcmp. #of regions = mini_array_len = (`wc -l .glimpse_index` - 3) / WORDS_PER_REGION */#define WORDS_PER_REGION	128#else	/* WORD_SORTED */struct mini {	long	offset;};/* Range of each mini_array entry is words with same hash32k value => 32K offsets into the index need to be stored */#define MINI_ARRAY_LEN		(64*1024)#endif	/* WORD_SORTED *//* For incremental indexing only */typedef struct _name_hashelement {	struct _name_hashelement *next;	char	*name;	int	name_len;	int	index;} name_hashelement;/* * Limit on number of files is MaxNum24bPartition. To change it, you need * to add encode/decode code everywhere, INDEX_ELEM_FREE and MAXNUM_INDIRECT. * * Limit on number of attributes is MaxNum16bPartition. To change it, you * need to add encode/decode code everywhere. That is: merge_splits(), * save_data_structures(), traverse(), merge_in() and scanword() * in glimpseindex; get_set() in glimpse; and printx.c. 
* * No need to change any other data structures. *//* Names of various system commands used in glimpseindex: use mv/rm etc rather than rename()/unlink() since former don't return unless parent-dir is sync-ed */#define SYSTEM_SORT	"sort"	/* replace with different sort with longer lines. Later write a procedure for sort that doesn't need system() */#define SYSTEM_LS	"ls"#define SYSTEM_MV	"mv"	/* this doesn't work with SFS */#define SYSTEM_RM	"rm"	/* this doesn't work with SFS */#define SYSTEM_CAT	"cat"#define SYSTEM_HEAD	"head"#define SYSTEM_CP	"cp"#define SYSTEM_ECHO	"echo"#define SYSTEM_WC	"wc"#define SYSTEM_AWK	"awk"	/* used at present only in "cast" package */extern char *escapesinglequote();#endif /* _GLIMPSE_H_ */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -