📄 cast.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* Copyright (c) 1994 Burra Gopal, Udi Manber.  All Rights Reserved. */

/*
 * cast.c:	main text compression routines. Exports tcompress() called
 *		from main() in read_out.c, and one other simple routine
 *		tcompressible_file(). This module can also be used from csearch.c.
 */

#include "defs.h"
#include <sys/time.h>
#if defined(__NeXT__)
                                      /* NeXT has no <utime.h> */
struct utimbuf {
        time_t actime;          /* access time */
        time_t modtime;         /* modification time */
};
#else
#include <utime.h>
#endif

#define ALNUMWORDS 1
#define MYEOF	0xffffffff

extern int RESERVED_CHARS;
extern int MAX_WORDS;
extern int SPECIAL_WORDS;
extern int BEGIN_SPECIAL_WORDS;
extern int END_SPECIAL_WORDS;
extern int NUM_SPECIAL_DELIMITERS;
extern int END_SPECIAL_DELIMITERS;
extern int ONE_VERBATIM;
extern int next_free_hash, next_free_str;
extern hash_entry freq_words_table[MAX_WORD_LEN+2][256];		/* 256 is the maximum possible number of special words */
extern char freq_words_strings[256][MAX_WORD_LEN+2];
extern int freq_words_lens[256];
extern char comp_signature[SIGNATURE_LEN];
extern hash_entry *compress_hash_table[HASH_TABLE_SIZE];
extern int usemalloc;

/*
 * initialize and load dictionaries
 *
 * Runs initialize_common() on freq_file, clears compress_hash_table and,
 * unless MAX_WORDS is 0, fills it from hash_file via tbuild_hash().
 * Error messages go to stderr only when TC_ERRORMSGS is set in flags.
 * Returns 1 on success, 0 on failure.
 */
int
initialize_tcompress(hash_file, freq_file, flags)
	char	*hash_file, *freq_file;
	int	flags;
{
	FILE	*hashfp;

	if (!initialize_common(freq_file, flags)) return 0;
	next_free_hash = 0;
	memset(compress_hash_table, '\0', sizeof(hash_entry *) * HASH_TABLE_SIZE);
        if (MAX_WORDS == 0) return 1;	/* no dictionary words: nothing to load */

	/* Load compress dictionary */
	if ((hashfp = fopen(hash_file, "r")) == NULL) {
		if (flags & TC_ERRORMSGS) {
			fprintf(stderr, "cannot open cast-dictionary file: %s\n", hash_file);
			fprintf(stderr, "(use -H to give a dictionary-dir or run 'buildcast' to make a dictionary)\n");
		}
		return 0;
	}
	if (!tbuild_hash(compress_hash_table, hashfp, -1)) {	/* read all bytes until end */
		fclose(hashfp);
		return 0;
	}
	fclose(hashfp);
	return 1;
}

/*
 * Undo initialize_tcompress(): calls uninitialize_common(), frees every
 * chained hash entry (and its word) when usemalloc is set, then clears the
 * table and resets the next_free_* counters. Always returns 0.
 */
int
uninitialize_tcompress()
{
	int	i;
	hash_entry *e, *t;

	uninitialize_common();
	if (usemalloc) {
		for (i=0; i<HASH_TABLE_SIZE; i++) {
			e = compress_hash_table[i];
			while (e != NULL) {
				t = e;
				e = e->next;	/* advance before the node is released */
				free(t->word);
				free(t);
			}
		}
	}
	memset(compress_hash_table, '\0', sizeof(hash_entry *) * HASH_TABLE_SIZE);
	next_free_hash = next_free_str = 0;
	return 0;
}

/*
 * TRUE if input file has been compressed already, FALSE otherwise
 *
 * buffer holds the first `length' bytes of the input. The first
 * SIGNATURE_LEN - 1 bytes are compared against comp_signature.
 * Prints "Already compressed," to stderr when TC_ERRORMSGS is set in flags.
 */
int
already_tcompressed(buffer, length, flags)
	char	*buffer;
	int	length;
	int	flags;
{
	char	*sig = comp_signature;

	/*
	 * A sample shorter than the compared prefix cannot contain the whole
	 * signature; bail out so strncmp() never reads beyond the valid bytes.
	 */
	if (length < (int)(SIGNATURE_LEN - 1)) return 0;

	if (!strncmp(buffer, sig, SIGNATURE_LEN - 1)) {
		if (flags & TC_ERRORMSGS)
			fprintf(stderr, "Already compressed,");
		return 1;
	}
	return 0;
}

extern int initialize_common_done;

/*
 * TRUE if input file is an ascii file, FALSE otherwise
 *
 * buffer/num_read describe a sample of the input. The sample is rejected
 * (return 0) when the dictionaries are not loaded, when ttest_binary(),
 * ttest_uuencode() or ttest_postscript() accept it, or when it already
 * carries the compression signature. The reason is printed to stderr when
 * TC_ERRORMSGS is set in flags.
 */
int
tcompressible(buffer, num_read, flags)
	char	*buffer;
	int	num_read;
	int	flags;
{
	if (!initialize_common_done) {
		if (flags & TC_ERRORMSGS)
			fprintf(stderr, "No cast-dictionary,");
		return 0;
	}
        if(ttest_binary(buffer, num_read)) {
		if (flags & TC_ERRORMSGS)
			fprintf(stderr, "Binary data,");
		return(0);
	}
        if(ttest_uuencode(buffer, num_read)) {
		if (flags & TC_ERRORMSGS)
			fprintf(stderr, "UUEncoded data,");
		return(0);
	}
        if(ttest_postscript(buffer, num_read)) {
		if (flags & TC_ERRORMSGS)
			fprintf(stderr, "Postscript data,");
		return(0);
	}
	if (already_tcompressed(buffer, num_read, flags)) return 0;
	return(1);
}

/*
 * Opens the file `name', reads up to SAMPLE_SIZE bytes from it and returns
 * tcompressible() of that sample. Returns 0 if the dictionaries are not
 * loaded or the file cannot be opened.
 */
int
tcompressible_file(name, flags)
	char	*name;
	int	flags;
{
	char	buf[SAMPLE_SIZE + 2];
	int	num;
	FILE	*fp = my_fopen(name, "r");

	if (!initialize_common_done) {
		if (flags & TC_ERRORMSGS)
			fprintf(stderr, "No cast-dictionary,");
		if (fp != NULL) fclose(fp);	/* opened above, before this check */
		return 0;
	}
	if (fp == NULL) return 0;
	num = fread(buf, 1, SAMPLE_SIZE, fp);
	fclose(fp);
	return(tcompressible(buf, num, flags));
}

/*
 * Same test as tcompressible_file() on an already open stream. stdin is
 * reported as compressible without being read. For any other stream up to
 * SAMPLE_SIZE bytes are consumed from fp and the stream is not repositioned
 * here.
 */
int
tcompressible_fp(fp, flags)
	FILE	*fp;
	int	flags;
{
	char	buf[SAMPLE_SIZE + 2];
	int	num;

	if (!initialize_common_done) {
		if (flags & TC_ERRORMSGS)
			fprintf(stderr, "No cast-dictionary,");
		return 0;
	}
	if (fp == stdin) return 1;
	num = fread(buf, 1, SAMPLE_SIZE, fp);
	return(tcompressible(buf, num, flags));
}

/* -------------------------------------------------------------------------
tgetword():
get a word from stream pointed to by fp: a "word" is an ID = a stream of
alphanumeric characters beginning with an alphanumeric char and ending with
a non-alphanumeric character. The character following the word is returned,
and the file pointer points to THIS character in the input stream. If there
is no word beginning at the current position of the file pointer, tgetword
simply behaves like getc(), i.e., just returns the character read. If the
word is too long, then it fills up all the bytes it can and returns the
character it could not fill up.
To read a series of words without doing an ungetc() for the extra character
read by tgetword, the caller can set *length to 1 and word[0] to the character
returned by tgetword. This can make compress work even if infile = stdin.

fp, buf, maxinlen and lenp are passed unchanged to mygetc(), which supplies
the input characters. word is kept NUL-terminated and may receive up to
MAX_NAME_LEN characters plus the terminator; *length is its current length
on entry and on return. Returns MYEOF when mygetc() reports end of input.
--------------------------------------------------------------------------*/
unsigned int
tgetword(fp, buf, maxinlen, lenp, word, length)
	FILE	*fp;
	char	*buf;
	int	maxinlen;
	int	*lenp;
	char	*word;
	int	*length;
{
	unsigned int	c;

#if	!ALNUMWORDS
	/* Variant that separates pure numbers from words starting with a letter. */
	if (*length > 0){
		c = (unsigned char)word[*length - 1];
		if (!isalpha(c)) goto not_alpha;
		else goto alpha;
	}
	if ((c = mygetc(fp, buf, maxinlen, lenp)) == MYEOF) return MYEOF;
	if (!isalpha(c)) {	/* this might be a number */
		if (!isdigit(c)) return c;
		word[*length] = c;
		(*length) ++;
		word[*length] = '\0';
	not_alpha:
		while(isdigit(c = mygetc(fp, buf, maxinlen, lenp))) {
			if (*length >= MAX_NAME_LEN) return c;
			word[*length] = c;
			(*length) ++;
			word[*length] = '\0';
		}
		return c;
	}
	else {	/* this might be a dictionary word */
		word[*length] = c;
		(*length) ++;
		word[*length] = '\0';
	alpha:
		while(isalnum(c = mygetc(fp, buf, maxinlen, lenp))) {
			if (*length >= MAX_NAME_LEN) return c;
			word[*length] = c;
			(*length) ++;
			word[*length] = '\0';
		}
		return c;
	}
#else	/*!ALNUMWORDS*/
	if (*length > 0){
		/* caller pre-seeded the word; c is overwritten by the loop below */
		c = word[*length - 1];
	}
	else {
		if ((c = mygetc(fp, buf, maxinlen, lenp)) == MYEOF) return MYEOF;
		if (!isalnum(c)) return c;	/* no word here: behave like getc() */
		word[(*length)++] = c;
		word[*length] = '\0';
	}
	while(((c = mygetc(fp, buf, maxinlen, lenp)) != MYEOF) && (isalnum(c))) {
		if (*length >= MAX_NAME_LEN) return c;	/* word full: hand back the char that did not fit */
		word[*length] = c;
		(*length) ++;
		word[*length] = '\0';
	}
	return c;
#endif	/*!ALNUMWORDS*/
}

/*--------------------------------------------------------------------
Skips a series of characters of the type skipc and sets the number of
characters skipped. Used to compress multiple blanks, tabs & newlines.
It returns the first character not equal to skipc. If there are no
characters beginning at the current location of the file pointer
which are equal to skipc, this function simply behaves as getc().

*skiplen starts at 1 because the caller has already consumed one skipc.
---------------------------------------------------------------------*/
int
skip(fp, buf, maxinlen, lenp, skipc, skiplen)
	FILE	*fp;
	char	*buf;
	int	maxinlen;
	int	*lenp;
	int	skipc;
	int	*skiplen;
{
	unsigned int	c;

	*skiplen = 1;	/* c has already been read! */
	while((c = mygetc(fp, buf, maxinlen, lenp)) == skipc) (*skiplen) ++;
	return c;
}

/* defined in misc.c */
extern char special_texts[];
extern char special_delimiters[];

/*
 * Returns the index of c in special_texts[] (first NUM_SPECIAL_TEXTS
 * entries), or -1 if it is not there.
 */
int
get_special_text_index(c)
	unsigned int	c;
{
	int	i;

	for(i=0; i<NUM_SPECIAL_TEXTS; i++)
		if (special_texts[i] == c) return i;
	return -1;
}

/*
 * Returns the index of c in special_delimiters[] (first
 * NUM_SPECIAL_DELIMITERS entries), or -1 if it is not there.
 */
int
get_special_delimiter_index(c)
	unsigned int	c;
{
	int	i;

	for(i=0; i<NUM_SPECIAL_DELIMITERS; i++)
		if (special_delimiters[i] == c) return i;
	return -1;
}

/*
 * process_spaces(skipc, skiplen): emit a run of skiplen characters skipc
 * (blank, tab or newline) as skiplen/2 "pair" codes followed by skiplen%2
 * single codes. In easysearch mode newlines are not paired: every newline
 * is written as NEWLINE.
 *
 * The macro expands inside a function and uses that function's easysearch,
 * maxoutlen, outlen, outfp and outbuf; when the output limit would be
 * reached it executes `return outlen' in the enclosing function.
 * NOTE(review): if outfp and outbuf were both non-NULL, outlen would be
 * advanced twice; presumably callers set only one of them -- confirm.
 */
#define process_spaces(skipc, skiplen)\
{\
	int	count2 = 0, count1 = 0, i;\
\
	count2 = (skiplen/2);\
	count1 = (skiplen%2);\
\
	if (easysearch) {\
		switch(skipc)\
		{\
		case ' ':\
			if ((maxoutlen >= 0) && (outlen + count1 + count2 >= maxoutlen)) return outlen;\
			if (outfp != NULL) {\
				for (i=0; i<count2; i++) putc(TWOBLANKS, outfp);\
				for (i=0; i<count1; i++) putc(BLANK, outfp);\
				outlen += count1 + count2;\
			}\
			if (outbuf != NULL) {\
				for (i=0; i<count2; i++) outbuf[outlen ++] = TWOBLANKS;\
				for (i=0; i<count1; i++) outbuf[outlen ++] = BLANK;\
			}\
			break;\
\
		case '\t':\
			if ((maxoutlen >= 0) && (outlen + count1 + count2 >= maxoutlen)) return outlen;\
			if (outfp != NULL) {\
				for (i=0; i<count2; i++) putc(TWOTABS, outfp);\
				for (i=0; i<count1; i++) putc(TAB, outfp);\
				outlen += count1 + count2;\
			}\
			if (outbuf != NULL) {\
				for (i=0; i<count2; i++) outbuf[outlen ++] = TWOTABS;\
				for (i=0; i<count1; i++) outbuf[outlen ++] = TAB;\
			}\
			break;\
\
		case '\n':\
			if ((maxoutlen >= 0) && (outlen + count1 + count2*2 >= maxoutlen)) return outlen;\
			if (outfp != NULL) {\
				for (i=0; i<count2*2; i++) putc(NEWLINE, outfp);\
				for (i=0; i<count1; i++) putc(NEWLINE, outfp);\
				outlen += count1 + count2*2;\
			}\
			if (outbuf != NULL) {\
				for (i=0; i<count2*2; i++) outbuf[outlen ++] = NEWLINE;\
				for (i=0; i<count1; i++) outbuf[outlen ++] = NEWLINE;\
			}\
			break;\
\
		default: break;	/* cannot reach here */\
		}\
	}\
	else {\
		if ((maxoutlen >= 0) && (outlen + count1 + count2 >= maxoutlen)) return outlen;\
		switch(skipc)\
		{\
		case ' ':\
			if (outfp != NULL) {\
				for (i=0; i<count2; i++) putc(TWOBLANKS, outfp);\
				for (i=0; i<count1; i++) putc(BLANK, outfp);\
				outlen += count1 + count2;\
			}\
			if (outbuf != NULL) {\
				for (i=0; i<count2; i++) outbuf[outlen ++] = TWOBLANKS;\
				for (i=0; i<count1; i++) outbuf[outlen ++] = BLANK;\
			}\
			break;\
\
		case '\t':\
			if (outfp != NULL) {\
				for (i=0; i<count2; i++) putc(TWOTABS, outfp);\
				for (i=0; i<count1; i++) putc(TAB, outfp);\
				outlen += count1 + count2;\
			}\
			if (outbuf != NULL) {\
				for (i=0; i<count2; i++) outbuf[outlen ++] = TWOTABS;\
				for (i=0; i<count1; i++) outbuf[outlen ++] = TAB;\
			}\
			break;\
\
		case '\n':\
			if (outfp != NULL) {\
				for (i=0; i<count2; i++) putc(TWONEWLINES, outfp);\
				for (i=0; i<count1; i++) putc(NEWLINE, outfp);\
				outlen += count1 + count2;\
			}\
			if (outbuf != NULL) {\
				for (i=0; i<count2; i++) outbuf[outlen ++] = TWONEWLINES;\
				for (i=0; i<count1; i++) outbuf[outlen ++] = NEWLINE;\
			}\
			break;\
\
		default: break;	/* cannot reach here */\
		}\
	}\
}

/*
 * PRE_VERBATIM(v): if the flag v is not yet set, set it and emit one
 * BEGIN_VERBATIM code to outfp and/or outbuf, advancing outlen by one.
 * v is set before the output-limit check, so it stays set even when the
 * macro returns outlen from the enclosing function.
 */
#define PRE_VERBATIM(v)\
{\
	if (!v) {\
		v = 1;\
		if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
		if (outfp != NULL) putc(BEGIN_VERBATIM, outfp);\
		if (outbuf != NULL) outbuf[outlen] = BEGIN_VERBATIM;\
		outlen ++;\
	}\
}

#define POST_VERBATIM(v) \
{\
	if (v) {\
		v = 0;\
		if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -