⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 uncast.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
/* Copyright (c) 1994 Burra Gopal, Udi Manber.  All Rights Reserved. */

/*
 * uncast.c:	main text uncompression routines. Exports tuncompress() called
 *		from main() in main_uncast.c, and one other simple routine
 *		tuncompressible_file().
 */

#include "defs.h"
#include <sys/time.h>
#if defined(__NeXT__)	/* NeXT has no <utime.h> */
struct utimbuf {
	time_t actime;		/* access time */
	time_t modtime;		/* modification time */
};
#else
#include <utime.h>
#endif

#define MYEOF	0xffffffff

extern int RESERVED_CHARS;
extern int MAX_WORDS;
extern int SPECIAL_WORDS;
extern int BEGIN_SPECIAL_WORDS;
extern int END_SPECIAL_WORDS;
extern int NUM_SPECIAL_DELIMITERS;
extern int END_SPECIAL_DELIMITERS;
extern int ONE_VERBATIM;
extern int TC_FOUND_BLANK, TC_FOUND_NOTBLANK;
extern char comp_signature[SIGNATURE_LEN];
extern hash_entry freq_words_table[MAX_WORD_LEN+2][256];	/* 256 is the maximum possible number of special words */
extern char freq_words_strings[256][MAX_WORD_LEN+2];
extern int freq_words_lens[256];
extern char *compress_string_table[DEF_MAX_WORDS]; /*[MAX_WORD_LEN+2]; */
extern int usemalloc, next_free_strtable;

/*
 * Prepares the uncompression state.
 *
 * string_file:	path of the dictionary file handed to build_string().
 * freq_file:	passed straight to initialize_common().
 * flags:	passed to initialize_common(); if TC_ERRORMSGS is set, a
 *		failure to open string_file is reported on stderr.
 *
 * Returns 1 on success (including the case MAX_WORDS == 0, where no
 * dictionary is loaded) and 0 if initialize_common(), fopen() or
 * build_string() fails.
 */
int
initialize_tuncompress(string_file, freq_file, flags)
	char	*string_file, *freq_file;
	int	flags;
{
	FILE	*stringfp;

	if (!initialize_common(freq_file, flags)) return 0;

	/* Start from an empty string table. */
	next_free_strtable = 0;
	memset(compress_string_table, '\0', sizeof(char *) * DEF_MAX_WORDS);
	if (MAX_WORDS == 0) return 1;

	/* Load uncompress dictionary */
	if ((stringfp = fopen(string_file, "r")) == NULL) {
		if (flags & TC_ERRORMSGS) {
			fprintf(stderr, "cannot open cast-dictionary file: %s\n", string_file);
			fprintf(stderr, "(use -H to give a dictionary-dir or run 'buildcast' to make a dictionary)\n");
		}
		return 0;
	}
	if (!build_string(compress_string_table, stringfp, -1, 0)) {	/* read all bytes until end */
		fclose(stringfp);
		return 0;
	}
	fclose(stringfp);
	return 1;
}

/*
 * Undoes initialize_tuncompress(): calls uninitialize_common(), frees the
 * string-table entries when usemalloc is set, then clears the table and
 * resets next_free_strtable.
 *
 * Always returns 1. (The original fell off the end of an implicit-int
 * function, so its return value was indeterminate.)
 */
int
uninitialize_tuncompress()
{
	int	i;

	uninitialize_common();
	if (usemalloc) {
		/*
		 * The table has DEF_MAX_WORDS slots but MAX_WORDS is a runtime
		 * value, so clamp the walk to the array bound as well.
		 */
		for (i = 0; i < MAX_WORDS && i < DEF_MAX_WORDS; i++)
			free(compress_string_table[i]);	/* free(NULL) is a no-op */
	}
	memset(compress_string_table, '\0', sizeof(char *) * DEF_MAX_WORDS);
	next_free_strtable = 0;
	return 1;
}

extern int initialize_common_done;

/*
 * Returns 1 if the first SIGNATURE_LEN - 1 bytes of buffer equal the first
 * SIGNATURE_LEN - 1 bytes of comp_signature, 0 otherwise. Also returns 0 if
 * initialize_common_done is not set or if fewer than SIGNATURE_LEN - 1
 * bytes were read (num_read).
 * (The original comment describes the signature as the first 15 bytes.)
 */
int
tuncompressible(buffer, num_read)
	char	*buffer;
	int	num_read;
{
	if (!initialize_common_done) return 0;
	if (num_read < SIGNATURE_LEN - 1) return 0;
	if (memcmp(buffer, comp_signature, SIGNATURE_LEN - 1) != 0) return 0;
	return 1;	/* a rewind is not done. hence this is useful even for stdin */
}

/*
 * Returns 1 if the name produced by special_get_name(name, len, ...) ends
 * in COMP_SUFFIX and is strictly longer than that suffix, 0 otherwise
 * (or if initialize_common_done is not set).
 *
 * NOTE(review): special_get_name() writes into a MAX_LINE_LEN buffer with no
 * size argument; presumably it bounds its own output -- confirm.
 */
int
tuncompressible_filename(name, len)
	char	*name;
	int	len;
{
	char	tempname[MAX_LINE_LEN];
	size_t	namelen, suffixlen;

	if (!initialize_common_done) return 0;
	special_get_name(name, len, tempname);
	namelen = strlen(tempname);
	suffixlen = strlen(COMP_SUFFIX);
	if (namelen < suffixlen + 1) return 0;
	if (strcmp(&tempname[namelen - suffixlen], COMP_SUFFIX) != 0) return 0;
	return 1;
}

/*
 * Returns 1 if the file called name passes both the file-name check
 * (tuncompressible_filename) and the signature check (tuncompressible) on
 * its leading bytes; 0 otherwise, including when it cannot be opened.
 */
int
tuncompressible_file(name)
	char	*name;
{
	char	buf[SIGNATURE_LEN + 2];
	int	num;
	FILE	*fp;

	if (!initialize_common_done) return 0;
	if (!tuncompressible_filename(name, strlen(name))) return 0;
	if ((fp = my_fopen(name, "r")) == NULL) return 0;
	num = (int)fread(buf, 1, SIGNATURE_LEN - 1, fp);
	fclose(fp);
	return(tuncompressible(buf, num));
}

/*
 * Same signature check as tuncompressible_file(), but on an already open
 * stream. Reads up to SIGNATURE_LEN - 1 bytes from fp; the stream is neither
 * rewound nor closed.
 */
int
tuncompressible_fp(fp)
	FILE	*fp;
{
	char	buf[SIGNATURE_LEN + 2];
	int	num;

	if (!initialize_common_done) return 0;
	num = (int)fread(buf, 1, SIGNATURE_LEN - 1, fp);
	return(tuncompressible(buf, num));
}

/* defined in misc.c */
extern char special_texts[];
extern char special_delimiters[];

/*
 * process_special_char(c): writes the expansion of the special code c.
 *
 * The macro relies on these names being in scope where it is used: outfp,
 * outbuf, outlen and maxoutlen. It writes to outfp if that is non-NULL and
 * to outbuf if that is non-NULL, advancing outlen by the number of
 * characters written in each case.
 *
 * Caveats for callers:
 *  - when maxoutlen >= 0 and the expansion would bring outlen up to
 *    maxoutlen or beyond, the macro executes `return outlen;` in the
 *    ENCLOSING function;
 *  - it expands to a bare { ... } block, not do { ... } while (0);
 *  - c is evaluated several times, so it must be free of side effects.
 */
#define process_special_char(c)\
{\
	if (outfp != NULL) {\
		switch(c)\
		{\
		case TWOBLANKS:\
			if ((maxoutlen >= 0) && (outlen + 2 >= maxoutlen)) return outlen;\
			putc(' ', outfp);\
			outlen ++;\
			putc(' ', outfp);\
			outlen ++;\
			break;\
		case BLANK:\
			if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
			putc(' ', outfp);\
			outlen ++;\
			break;\
\
		case TWOTABS:\
			if ((maxoutlen >= 0) && (outlen + 2 >= maxoutlen)) return outlen;\
			putc('\t', outfp);\
			outlen ++;\
			putc('\t', outfp);\
			outlen ++;\
			break;\
		case TAB:\
			if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
			putc('\t', outfp);\
			outlen ++;\
			break;\
\
		case TWONEWLINES:\
			if ((maxoutlen >= 0) && (outlen + 2 >= maxoutlen)) return outlen;\
			putc('\n', outfp);\
			outlen ++;\
			putc('\n', outfp);\
			outlen ++;\
			break;\
		case NEWLINE:\
			if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
			putc('\n', outfp);\
			outlen ++;\
			break;\
\
		default:\
			if ((c < END_SPECIAL_TEXTS) && (c >= BEGIN_SPECIAL_TEXTS)) {\
				if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
				putc(special_texts[c - BEGIN_SPECIAL_TEXTS], outfp); outlen ++;\
			}\
			else if ((c < END_SPECIAL_DELIMITERS) && (c >= BEGIN_SPECIAL_DELIMITERS)) {\
				if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
				putc(special_delimiters[c - BEGIN_SPECIAL_DELIMITERS], outfp); outlen ++;\
			}\
			else if ((c < END_SPECIAL_WORDS) && (c >= BEGIN_SPECIAL_WORDS)) {\
				if ((maxoutlen >= 0) && (outlen + freq_words_lens[c - BEGIN_SPECIAL_WORDS] >= maxoutlen)) return outlen;\
				fprintf(outfp, "%s", freq_words_strings[c - BEGIN_SPECIAL_WORDS]); outlen += freq_words_lens[c - BEGIN_SPECIAL_WORDS];\
			}\
			/* else should not have called this function */\
		}\
	}\
	if (outbuf != NULL) {\
		switch(c)\
		{\
		case TWOBLANKS:\
			if ((maxoutlen >= 0) && (outlen + 2 >= maxoutlen)) return outlen;\
			outbuf[outlen ++] = ' ';\
			outbuf[outlen ++] = ' ';\
			break;\
		case BLANK:\
			if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
			outbuf[outlen ++] = ' ';\
			break;\
\
		case TWOTABS:\
			if ((maxoutlen >= 0) && (outlen + 2 >= maxoutlen)) return outlen;\
			outbuf[outlen ++] = '\t';\
			outbuf[outlen ++] = '\t';\
			break;\
		case TAB:\
			if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
			outbuf[outlen ++] = '\t';\
			break;\
\
		case TWONEWLINES:\
			if ((maxoutlen >= 0) && (outlen + 2 >= maxoutlen)) return outlen;\
			outbuf[outlen ++] = '\n';\
			outbuf[outlen ++] = '\n';\
			break;\
		case NEWLINE:\
			if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
			outbuf[outlen ++] = '\n';\
			break;\
\
		default:\
			if ((c < END_SPECIAL_TEXTS) && (c >= BEGIN_SPECIAL_TEXTS)) {\
				if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
				outbuf[outlen ++] = special_texts[c - BEGIN_SPECIAL_TEXTS];\
			}\
			else if ((c < END_SPECIAL_DELIMITERS) && (c >= BEGIN_SPECIAL_DELIMITERS)) {\
				if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\
				outbuf[outlen ++] = special_delimiters[c - BEGIN_SPECIAL_DELIMITERS];\
			}\
			else if ((c < END_SPECIAL_WORDS) && (c >= BEGIN_SPECIAL_WORDS)) {\
				/* printf("-->%s\n", freq_words_strings[c-BEGIN_SPECIAL_WORDS]); */\
				if ((maxoutlen >= 0) && (outlen + freq_words_lens[c - BEGIN_SPECIAL_WORDS] >= maxoutlen)) return outlen;\
				memcpy(outbuf+outlen, freq_words_strings[c - BEGIN_SPECIAL_WORDS], freq_words_lens[c - BEGIN_SPECIAL_WORDS]);\
				outlen += freq_words_lens[c - BEGIN_SPECIAL_WORDS]; \
			}\
			/* else should not have called this function */\
		}\
	}\
}

int UNCAST_ERRORS = 0;

/* Uncompresses input from indata and outputs it into outdata: returns number of chars in output */
int
tuncompress(indata, maxinlen, outdata, maxoutlen, flags)
	void	*indata, *outdata;
	int	maxinlen, maxoutlen;
	int	flags;
{
	unsigned short	index, dindex;
	unsigned int	c;
	int	verbatim_state = 0;
	int	inlen, outlen = 0;
	FILE	*infp = NULL, *outfp = NULL;
	unsigned char	*inbuf = NULL, *outbuf = NULL;
	int	easysearch = flags&TC_EASYSEARCH;
	int	untilnewline = flags&TC_UNTILNEWLINE;

	if (flags & TC_SILENT) return 0;
	if (maxinlen < 0) {
		infp = (FILE *)indata;
		if ((easysearch = mygetc(infp, inbuf, maxinlen, &inlen)) == MYEOF) return outlen;	/* ignore parameter: take from file */
		inlen = SIGNATURE_LEN;
	}
	else {	/* don't care about signature: user's responsibility */
		inbuf = (unsigned char *)indata;
		inlen = 0;
	}
	if (maxoutlen < 0) {
		outfp = (FILE 
*)outdata;	}	else {		outbuf = (unsigned char *)outdata;	}	if (easysearch) {		ONE_VERBATIM = EASY_ONE_VERBATIM;		NUM_SPECIAL_DELIMITERS = EASY_NUM_SPECIAL_DELIMITERS;		END_SPECIAL_DELIMITERS = EASY_END_SPECIAL_DELIMITERS;	}	else {		ONE_VERBATIM = HARD_ONE_VERBATIM;		NUM_SPECIAL_DELIMITERS = HARD_NUM_SPECIAL_DELIMITERS;		END_SPECIAL_DELIMITERS = HARD_END_SPECIAL_DELIMITERS;	}	if (TC_FOUND_BLANK) {		if (outfp != NULL) putc(' ', outfp);		if (outbuf != NULL) outbuf[outlen] = ' ';		outlen ++;	}	TC_FOUND_BLANK = 0;	/* default: use result of previous backward_tcompressed_word only */	/*	 * The algorithm, as expected, is a complete inverse of the compression	 * algorithm: see tcompress.c in this directory to understand this function.	 * I've used gotos since the termination condition is too complex.	 * The two sub-parts are exactly the same except for verbatim processing.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -