📄 defs.h

📁 harvest是一个下载html网页的机器人
💻 H
字号:
/* Copyright (c) 1994 Burra Gopal, Udi Manber.  All Rights Reserved. *//************************************************************************** * defs.h:	contains definitions for our static/dictionary based	  * *		compression scheme that is tailored for very fast search. * **************************************************************************/#ifndef	_DEFS_H_#define _DEFS_H_#include <sys/types.h>#include <sys/stat.h>#include <ctype.h>#include "glimpse.h"#undef COMP_SUFFIX#undef DEF_STRING_FILE#undef DEF_HASH_FILE#undef DEF_FREQ_FILE#undef SIGNATURE_LEN#define MIN_WORD_LEN		1		/* smaller words are not indexed: heuristics like special_texts etc. must be used: verbatim is good enough */#define HASH_TABLE_SIZE		MAX_64K_HASH#define SMALL_HASH_TABLE_SIZE	MAX_4K_HASH#define HASH_ENTRY_SIZE		32 		/* hash-file stores: name of len=24, a 5 digit int, a ' ' + a '\n' = 31 bytes + some padding once in a while */#define DEF_BLOCKSIZE		4096		/* I/O unit size = OS page size */#define MIN_BLOCKSIZE		512		/* granularity for above and below */#define HASH_FILE_BLOCKS	(HASH_TABLE_SIZE * HASH_ENTRY_SIZE / MIN_BLOCKSIZE)#define STRING_FILE_BLOCKS	(HASH_TABLE_SIZE * MAX_WORD_LEN / MIN_BLOCKSIZE)#define MAX_SPECIAL_CHARS	32		/* Maximum # of special characters used during compress */#define DEF_SPECIAL_WORDS	32		/* Special words for which 1B codes are reserved */#define COMP_ATLEAST		10		/* At least 10% compression is needed */#define COMP_SUFFIX		".CZ"		/* Common suffix used for all compressed files: IT INCLUDES THE '.' !!! */#define DEF_INDEX_FILE		INDEX_FILE	/* same as glimpse's */#define DEF_STRING_FILE		".glimpse_uncompress"#define DEF_HASH_FILE		".glimpse_compress"#define DEF_FREQ_FILE		".glimpse_quick"#define DEF_THRESHOLD		16		/* 256? 
default for min bytes to be coverd before storing in hash table */#define MAX_THRESHOLD		65535		/* MAX_WORDS*MAX_THRESHOLD must be < 2**32 - 1 = maxoffset = maxdiskspace = integer */#define MAX_LSB			254		/* 256 - |{'\0', '\n'}| */#define DEF_MAX_WORDS		(MAX_LSB*MAX_LSB)#define SAMPLE_SIZE		8192		/* amount of data read to determine file-type: NOT CALLED FOR STDIN! */#define SIGNATURE_LEN		16		/* to avoid calling strlen: including \0! */typedef struct _hash_entry {	struct _hash_entry *next;	char *word;				/* string itself */	union {		int	offset;			/* offset into the dictionary file: used only while building compress's dict from glimpse's dict */		struct {			short	freq;		/* number of times the word occurs -- provided it is in the dictionary */			short	index;		/* index into the string table */		} attribute;			/* once freq > THRESHOLD, its just an index into the string table: used only while compressing a file */	} val;} hash_entry;/* * The total number of special characters (1..4) CANNOT exceed MAX_SPECIAL_CHARS. * The arrangement is as follows: * 1. SPECIAL_TEXTS * 2. SPECIAL_SEPARATORS * 3. SPECIAL_DELIMITERS * 4. VERBATIM * 5. SPECIAL_WORDS * Any rearrangement of these can be done provided the BEGIN/END values * are defined properly: the NUMs remain the same. */#define BEGIN_SPECIAL_CHARS	1		/* character 0 is never a part of any code */#define END_SPECIAL_CHARS	30		/* Not including begin/end verbatim *//* Special delimiters are text-sequences which can come after a word instead of a blank: this is a subset of the above with '\n' and '\t' */#define EASY_NUM_SPECIAL_DELIMITERS	8	/* numbered from 1 .. 
8 */#define HARD_NUM_SPECIAL_DELIMITERS	9	/* extra: a special kind of newline */#define SPECIAL_DELIMITERS		{ '.', ',', ':', '-', ';', '!', '"', '\'', '\n'}#define BEGIN_SPECIAL_DELIMITERS	BEGIN_SPECIAL_CHARS#define EASY_END_SPECIAL_DELIMITERS	9#define HARD_END_SPECIAL_DELIMITERS	10/* Special separators are things that can separate two words: they are 2blanks, 2tabs or 2newlines */#define NUM_SEPARATORS		7		/* numbered from 10 .. 16 */#define NEWLINE			'\n' 		/* = HARD_END_SPECIAL_DELIMITERS --> carefully chosen so that this is TRUE !!!! Speeds up searches */#define NOTBLANK		(NEWLINE + 1)	/* acts like unputc(' ') if char after a word != blk OR sp-delims */#define BLANK			(NOTBLANK + 1)#define TAB			(NOTBLANK + 2)#define TWOBLANKS		(NOTBLANK + 3)	/* Beginning of a sentence */#define TWOTABS			(NOTBLANK + 4)	/* Indentation */#define TWONEWLINES		(NOTBLANK + 5)	/* Beginning of a paragraph */#define BEGIN_SEPARATORS	10#define END_SEPARATORS		17/* * An alternate way would be to have a code for BLANK and NBLANKS, TAB and NTABS, and, NEWLINE and NNEWLINES: * in each of these cases, the byte occuring immediately next would determine the number of BLANKS/TABS/NEWLINES. * Though this works for a general number of cases, it needs two bytes of encoding: which makes us * wonder whether those cases occur commonly enough to waste two bytes to encode two blanks (common). * The present encoding guarantees 50% compression for any sequence of separators anyway, and is much simpler. *//* Special texts are text-sequences which have a 1 byte codes associated with them: these appear first among the special things */#define NUM_SPECIAL_TEXTS	13		/* numbered from 17 .. 
29 */#define SPECIAL_TEXTS		{ '.', ',', ':', '-', ';', '!', '"', '\'', '#', '$', '%', '(', ')'}	/* Could have used ?, @ and & too */#define BEGIN_SPECIAL_TEXTS	17#define END_SPECIAL_TEXTS	30/* Characters for literal text */#define BEGIN_VERBATIM		30#define END_VERBATIM		31#define EASY_ONE_VERBATIM	EASY_END_SPECIAL_DELIMITERS#define HARD_ONE_VERBATIM	BEGIN_VERBATIM	/* Is not an ascii char since ascii is 32.. *//* BEGIN and END SPECIAL_WORDS are variables */#if	0/* THIS WON'T REALLY HELP SINCE SOURCE CODE RARELY HAS COMMON WORDS: KEYWORDS ARE VERY SMALL SO THEY HARDLY GIVE ANY COMPRESSION */char special_program_chars[] = { '.', ',', ':', '-', '!', ';', '?', '+', '/', '\'', '"', '~', '`', '&', '@', '#', '$', '%', '^', '*', '=', '(', ')', '{', '}', '[', ']', '_', '|', '\\', '<', '>' };#endif	/*0*//* * Common exported functions. */unsigned short encode_index();unsigned short decode_index();unsigned int mygetc();int is_little_endian();int build_string();int build_hash();int dump_hash();int dump_string();int get_word_from_offset();int dump_and_free_string_hash();hash_entry *insert_hash();hash_entry *get_hash();int hash_it();char * tescapesinglequote();/* * The beauty of this allocation scheme is that "free" does not need to be implemented! * The total memory occupied by both the string and hash tables is appx 1.5 MB */#define hashfree(h)	if (usemalloc) free(e);#define hashalloc(e) \{\	if (usemalloc) (e) = (hash_entry *)malloc(sizeof(hash_entry));\	else {\		if (free_hash == NULL) free_hash = (hash_entry *)malloc(sizeof(hash_entry) * DEF_MAX_WORDS);\		if (free_hash == NULL) (e) = NULL;\		else (e) = ((next_free_hash >= DEF_MAX_WORDS) ? 
(NULL) : (&(free_hash[next_free_hash ++])));\	}\	if ((e) == NULL) {fprintf(stderr, "Out of memory in cast-hash-table!\n"); exit(2); }\}#define strfree(s)	if (usemalloc) free(s);/* called ONLY in the build procedure in which we can afford to be slow and do an strcpy since sizes of words are not determined: hardcoded in build_hash() */#define stralloc(s, len) \{\	if (usemalloc) (s) = (char *)malloc(len);\	else {\		if (free_str == NULL) free_str = (char *)malloc(AVG_WORD_LEN * DEF_MAX_WORDS);\		if (free_str == NULL) (s) = NULL;\		else (s) = ((next_free_str >= AVG_WORD_LEN * DEF_MAX_WORDS) ? (NULL) : (&(free_str[next_free_str]))); next_free_str += (len);\	}\	if ((s) == NULL) {fprintf(stderr, "Out of memory in cast-string-table!\n"); exit(2); }\}/* There is no equivalent strtablealloc since it is hardcoded into build_string and is not used anywhere else *//* Some flags corr. to user options: avoid global variables for options, pass flags as parameters */#define TC_EASYSEARCH	0x1#define TC_UNTILNEWLINE	0x2#define TC_REMOVE	0x4#define TC_OVERWRITE	0x8#define TC_RECURSIVE	0x10#define TC_ERRORMSGS	0x20#define TC_SILENT	0x40#define TC_NOPROMPT	0x80#define TC_FILENAMESONSTDIN 0x100#define CAST_VERSION	"1.0"#define CAST_DATE	"1994"#endif	/*_DEFS_H_*/
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -