📄 cast.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
		if (outfp != NULL) putc(END_VERBATIM, outfp);\		if (outbuf != NULL) outbuf[outlen] = END_VERBATIM;\		outlen ++;\	}\}#define EASY_PRE_VERBATIM(v) \{\	if (easysearch) {\		if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;\		if (outfp != NULL) putc(ONE_VERBATIM, outfp);\		if (outbuf != NULL) outbuf[outlen] = ONE_VERBATIM;\		outlen ++;\	}\	else {\		PRE_VERBATIM(v)\	}\}#define EASY_POST_VERBATIM(v) \{\	if (easysearch) {\		POST_VERBATIM(v)\	}\	/* else ignore */\}intget_special_word_index(word, len)	char	word[MAX_NAME_LEN];	int	len;{	register int	comp;	hash_entry	*e;	if ((len > MAX_WORD_LEN) || (SPECIAL_WORDS <= 0)) return -1;	e = freq_words_table[len];	while((e != NULL) && (e->val.offset != -1)) {		comp = strcmp(word, e->word);		if (comp == 0) return e->val.offset;		if (comp < 0) return -1;	/* can't find it anyway */		e = e->next;	}	return -1;}/* Compresses input from indata and outputs it into outdata: returns number of chars in output */inttcompress(indata, maxinlen, outdata, maxoutlen, flags)	void	*indata, *outdata;	int	maxinlen, maxoutlen;	int	flags;{	unsigned char	curword[MAX_NAME_LEN];	int	curlen;	int	hashindex;	hash_entry *e;	unsigned int	c;	unsigned short	encodedindex;	int	skiplen;	int	ret;	int	verbatim_state = 0;	char	*sig = comp_signature;	FILE	*infp = NULL, *outfp = NULL;	unsigned char	*inbuf = NULL, *outbuf = NULL;	int	outlen = 0, inlen = 0;	int	easysearch = flags&TC_EASYSEARCH;	int	untilnewline = flags&TC_UNTILNEWLINE;	if (flags & TC_SILENT) return 0;	if (easysearch) {		ONE_VERBATIM = EASY_ONE_VERBATIM;		NUM_SPECIAL_DELIMITERS = EASY_NUM_SPECIAL_DELIMITERS;		END_SPECIAL_DELIMITERS = EASY_END_SPECIAL_DELIMITERS;	}	else {		ONE_VERBATIM = HARD_ONE_VERBATIM;		NUM_SPECIAL_DELIMITERS = HARD_NUM_SPECIAL_DELIMITERS;		END_SPECIAL_DELIMITERS = HARD_END_SPECIAL_DELIMITERS;	}	if (maxinlen < 0) {		infp = (FILE *)indata;	}	else {		inbuf = (unsigned char *)indata;	}	if (maxoutlen < 0) {		outfp = (FILE *)outdata;	}	else {		outbuf = (unsigned char *)outdata;	
}	/* Write signature and information about whether compression was context-free or not: first 16 bytes */	if (outfp != NULL) {		if ((maxoutlen >= 0) && (outlen + SIGNATURE_LEN >= maxoutlen)) return outlen;		if (0 == fwrite(sig, 1, SIGNATURE_LEN - 1, outfp)) return 0;		if (easysearch) putc(1, outfp);		else putc(0, outfp);		outlen += SIGNATURE_LEN;	}	/* No need to put a signature OR easysearch when doing it in memory: caller must manipulate */	/*	 * The algorithm for compression is as follows:	 *	 * For each input word, we search and see if it is in the dictionary.	 * If it IS there, we just look at its word-index and output it.	 * Then, if the character immediately after the word is NOT a blank,	 * we output a second character indicating what it was.	 *	 * If it is not in the dictionary then we output it verbatim: for	 * verbatim o/p, we take care to merge consecutive verbatim outputs	 * by NOT putting delimiters between them (one start and one end	 * delimiter).	 *	 * If the input is not a word but a single character, then it can be:	 * 1. A special character, in which case we output its code.	 * 2. A blank character in which case we keep getting more characters	 *    to see howmany blanks we get. At the first non blank character,	 *    we output a sequence of special characters which encode multiple	 *    blanks (note: blanks can be spaces, tabs or newlines).	 *	 * Please refer to the state diagram for explanations.	 * I've used gotos since the termination condition is too complex.	 */real_tgetword:	curlen = 0;	curword[0] = '\0';concocted_tgetword:	c = tgetword(infp, inbuf, maxinlen, &inlen, curword, &curlen);bypass_tgetword:	if (curlen == 0) { /* only one character read and that is in c. */		switch(c)		{		case ' ':		case '\t':		case '\n':			POST_VERBATIM(verbatim_state);	/* need post-verbatim since there might be a LOT of blanks, etc. 
*/			ret = skip(infp, inbuf, maxinlen, &inlen, c, &skiplen);			process_spaces(c, skiplen);			if ((c == '\n') && untilnewline) return outlen;			if (isalnum((unsigned char)ret)) {				curword[0] = (unsigned char)ret;				curword[1] = '\0';				curlen = 1;				goto concocted_tgetword;			}			else if (ret != MYEOF) {				c = (unsigned int)ret;				goto bypass_tgetword;			}			/* else fall thru */		case MYEOF: return outlen;		default:			if ((ret = get_special_text_index(c)) != -1) {				if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;				if (verbatim_state) {	/* no need to do post-verbatim since only one character: optimization */					if (outfp != NULL) putc(c, outfp);					if (outbuf != NULL) outbuf[outlen] = c;					outlen ++;				}				else {					if (outfp != NULL) putc(ret + BEGIN_SPECIAL_TEXTS, outfp);					if (outbuf != NULL) outbuf[outlen] = ret + BEGIN_SPECIAL_TEXTS;					outlen ++;				}			}			else {				/*				 * Has to be verbatim character: they have a ONE_VERBATIM before each				 * irrespective of verbatim_state. Otherwise there is no way to differentiate				 * one of our special characters from the same characters appearing in the				 * source. Hence binary files blow-up to twice their original size.				 * 				 * Also, if it is a verbatim character that cannot be confused with one of OUR				 * special characters, then just put it in w/o changing verbatim state. Else				 * put a begin-verbatim before it and THEN output that character=saves 1 char.				 
*/				if ((c != BEGIN_VERBATIM) && (c != END_VERBATIM)) {	/* reduces to below if easysearch */					EASY_PRE_VERBATIM(verbatim_state)					if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;					if (outfp != NULL) putc(c, outfp);					if (outbuf != NULL) outbuf[outlen] = c;					outlen ++;				}				else {	/* like \ escape in C: \ is \\ */					if ((maxoutlen >= 0) && (outlen + 2 >= maxoutlen)) return outlen;					if (outfp != NULL) putc(ONE_VERBATIM, outfp);					if (outbuf != NULL) outbuf[outlen] = ONE_VERBATIM;					outlen ++;					if (outfp != NULL) putc(c, outfp);					if (outbuf != NULL) outbuf[outlen] = c;					outlen ++;				}			}			goto real_tgetword;		}	}	else	/* curlen >= 1 */	{		if (!easysearch && verbatim_state && (curlen <= 2)) {			fprintf(outfp, "%s", curword);	/* don't bother to close the verbatim state and put a 2byte index=saves 1 char */			curword[0] = '\0';			curlen = 0;			goto bypass_tgetword;		}		else		{			if ((ret = get_special_word_index(curword, curlen)) != -1) {				POST_VERBATIM(verbatim_state);				/* printf("ret=%d word=%s\n", ret, curword); */				if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;				if (outfp != NULL) putc(ret + BEGIN_SPECIAL_WORDS, outfp);				if (outbuf != NULL) outbuf[outlen] = ret + BEGIN_SPECIAL_WORDS;				outlen ++;			}			else if ((e = get_hash(compress_hash_table, curword, curlen, &hashindex)) != NULL) {#if	0				fprintf(stderr, "%x ", e->val.attribute.index);#endif	/*0*/				encodedindex = encode_index(e->val.attribute.index);				POST_VERBATIM(verbatim_state);				if ((maxoutlen >= 0) && (outlen + sizeof(short) >= maxoutlen)) return outlen;				if (outfp != NULL) {					putc(((encodedindex & 0xff00)>>8), outfp);					putc((encodedindex & 0x00ff), outfp);				}				if (outbuf != NULL) {					outbuf[outlen] = ((encodedindex & 0xff00)>>8);					outbuf[outlen + 1] = encodedindex & 0x00ff;				}				outlen += sizeof(short);			}			else goto NOT_IN_DICTIONARY;		/* process_char_after_word: */			switch(c)			{			case ' ':	
			goto real_tgetword;	/* blank is a part of the word */			case MYEOF:				if (easysearch) return outlen;				if (outfp != NULL) putc(NOTBLANK, outfp);				if (outbuf != NULL) outbuf[outlen] = NOTBLANK;				outlen ++;				return outlen;			default:				if ((maxoutlen >= 0) && (outlen + 1 >= maxoutlen)) return outlen;				if ((ret = get_special_delimiter_index(c)) != -1) {					if (outfp != NULL) putc((ret+BEGIN_SPECIAL_DELIMITERS), outfp);					if (outbuf != NULL) outbuf[outlen] = ret + BEGIN_SPECIAL_DELIMITERS;					outlen ++;					goto real_tgetword;				}				else {					if (outfp != NULL) putc(NOTBLANK, outfp);					if (outbuf != NULL) outbuf[outlen] = NOTBLANK;					outlen ++;					if (!isalnum(c)) {						curword[0] = '\0';						curlen = 0;						goto bypass_tgetword;					}					else {	/* might be a number which ended with an alphabet: ".. born in 1992AD" */						curword[0] = c;						curword[1] = '\0';						curlen = 1;						goto concocted_tgetword;					}				}			}		}	NOT_IN_DICTIONARY: /* word not in dictionary */		PRE_VERBATIM(verbatim_state);		if ((maxoutlen >= 0) && (outlen + curlen >= maxoutlen)) return outlen;		if ((outfp != NULL) && (0 == fwrite(curword, sizeof(char), curlen, outfp))) return 0;		if (outbuf != NULL) memcpy(outbuf+outlen, curword, curlen);		outlen += curlen;		EASY_POST_VERBATIM(verbatim_state);		switch(c)		{		case MYEOF: /* Prefix searches still work since our scheme is context free */ return outlen;		default:			if (!isalnum(c)) {				curword[0] = '\0';				curlen = 0;				goto bypass_tgetword;			}			else {	/* might be a number which ended with an alphabet: ".. 
born in 1992AD" */				curword[0] = c;				curword[1] = '\0';				curlen = 1;				goto concocted_tgetword;			}		}	}}#define FUNCTION	tcompress_file#define DIRECTORY	tcompress_directory#include "trecursive.c"/* returns #bytes (>=0) in the compressed file, -1 if major error (not able to compress) */tcompress_file(name, outname, flags)	char	*name, *outname;	int	flags;{	FILE	*fp;	FILE	*outfp;	int	inlen, ret;	struct stat statbuf;	/* struct timeval tvp[2]; */	struct utimbuf tvp;	char	tempname[MAX_LINE_LEN];	if (name == NULL) return -1;	special_get_name(name, -1, tempname);	inlen = strlen(tempname);	if (-1 == stat(tempname, &statbuf)) {		if (flags & TC_ERRORMSGS)			fprintf(stderr, "permission denied or non-existent: %s\n", tempname);		return -1;	}	if (S_ISDIR(statbuf.st_mode)) {		if (flags & TC_RECURSIVE) return tcompress_directory(tempname, outname, flags);		if (flags & TC_ERRORMSGS)			fprintf(stderr, "skipping directory: %s\n", tempname);		return -1;	}	if (!S_ISREG(statbuf.st_mode)) {		if (flags & TC_ERRORMSGS)			fprintf(stderr, "not a regular file, skipping: %s\n", tempname);		return -1;	}	if ((fp = fopen(tempname, "r")) == NULL) {		if (flags & TC_ERRORMSGS)			fprintf(stderr, "permission denied or non-existent: %s\n", tempname);		return -1;	}	if (!tcompressible_fp(fp, flags)) {		if (flags & TC_ERRORMSGS)			fprintf(stderr, " skipping: %s\n", tempname);		fclose(fp);		return -1;	}	rewind(fp);	if (flags & TC_SILENT) {		printf("%s\n", tempname);		fclose(fp);		return 0;	}	/* Create and open output file */	strncpy(outname, tempname, MAX_LINE_LEN);	if (inlen + strlen(COMP_SUFFIX) + 1 >= MAX_LINE_LEN) {		outname[MAX_LINE_LEN - strlen(COMP_SUFFIX)] = '\0';		fprintf(stderr, "very long file name %s: truncating to: %s", tempname, outname);	}	strcat(outname, COMP_SUFFIX);	if (!access(outname, R_OK)) {	/* output file exists */		if (!(flags & TC_OVERWRITE)) {			fclose(fp);			return 0;		}		else if (!(flags & TC_NOPROMPT)) {			char	s[8];			printf("overwrite %s? 
(y/n): ", outname);			scanf("%c", s);			if (s[0] != 'y') {				fclose(fp);				return 0;			}		}	}	if ((outfp = fopen(outname, "w")) == NULL) {		fprintf(stderr, "cannot open for writing: %s\n", outname);		fclose(fp);		return -1;	}	ret = tcompress(fp, -1, outfp, -1, flags);	if ((statbuf.st_size * (100 - COMP_ATLEAST))/100 < ret) {		fprintf(stderr, "less than %d%% compression, skipping: %s\n", COMP_ATLEAST, tempname);		fclose(fp);		rewind(outfp);		fclose(outfp);		unlink(outname);		return ret;	}	if ((ret > 0) && (flags & TC_REMOVE)) unlink(tempname);	fclose(fp);	fflush(outfp);	fclose(outfp);	/*	tvp[0].tv_sec = statbuf.st_atime;	tvp[0].tv_usec = 0;	tvp[1].tv_sec = statbuf.st_mtime;	tvp[1].tv_usec = 0;	utimes(outname, tvp);	*/	tvp.actime = statbuf.st_atime;	tvp.modtime = statbuf.st_mtime;	utime(outname, &tvp);	return ret;}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -