⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sgrep.c

📁 harvest是一个下载HTML网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
	CHARTYPE *curtextbegin;	CHARTYPE *curtextend;#if	MEASURE_TIMES	struct timeval initt, finalt;#endif	if(SILENT) return 0;	if (TCOMPRESSED == ON) {		if (!DELIMITER) {			curtextbegin = text + *i; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));			if (*curtextbegin == '\n') curtextbegin ++;			curtextend = curtextbegin /*text -m + *i*/ /* + 1 agrep() has i++ */; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;			if (*curtextend == '\n') curtextend ++;		}		else {			curtextbegin = backward_delimiter(text + *i, text, tc_D_pattern, tc_D_length, OUTTAIL);			curtextend = forward_delimiter(curtextbegin /*text -m + *i*/ /* + 1 agrep() has i++ */, textend, tc_D_pattern, tc_D_length, OUTTAIL);		}	}	else {		if (!DELIMITER) {			curtextbegin = text + *i; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));			if (*curtextbegin == '\n') curtextbegin ++;			curtextend = curtextbegin /*text -m + *i*/ /* + 1 agrep() has i++ */; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;			if (*curtextend == '\n') curtextend ++;		}		else {			curtextbegin = backward_delimiter(text + *i, text, D_pattern, D_length, OUTTAIL);			curtextend = forward_delimiter(curtextbegin /*text -m + *i*/ /* + 1 agrep() has i++ */, textend, D_pattern, D_length, OUTTAIL);		}	}	if (TCOMPRESSED == ON) {#if     MEASURE_TIMES		gettimeofday(&initt, NULL);#endif  /*MEASURE_TIMES*/		if (-1 == exists_tcompressed_word(pat, m, curtextbegin, text  + *i - curtextbegin + m, EASYSEARCH)) {			num_of_matched --;			return 0;		}#if     MEASURE_TIMES		gettimeofday(&finalt, NULL);		FILTERALGO_ms +=  (finalt.tv_sec *1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);#endif  /*MEASURE_TIMES*/	}	textbegin = curtextend; /*(curtextend - 1 > textbegin ? 
curtextend - 1 : curtextend); */	oldi = *i;	*i += textbegin - (text + *i);	if(COUNT) return 0;	if (INVERSE) {		if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */			if (agrep_finalfp != NULL)				newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, *lastout, curtextbegin - *lastout, agrep_finalfp, -1, EASYSEARCH);			else {				if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, *lastout, curtextbegin - *lastout, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {					if (newlen + agrep_outpointer >= agrep_outlen) {						OUTPUT_OVERFLOW;						return -1;					}					agrep_outpointer += newlen;				}			}			*lastout=textbegin;			CurrentByteOffset += textbegin - text;			text = textbegin;		}		else { /* NOT TCOMPRESSED */			if (agrep_finalfp != NULL)				fwrite(*lastout, 1, curtextbegin-*lastout, agrep_finalfp);			else {				if (curtextbegin - *lastout + agrep_outpointer >= agrep_outlen) {					OUTPUT_OVERFLOW;					return -1;				}				memcpy(agrep_outbuffer+agrep_outpointer, *lastout, curtextbegin-*lastout);				agrep_outpointer += (curtextbegin - *lastout);			}			*lastout=textbegin;			CurrentByteOffset += textbegin - text;			text = textbegin;		} /* TCOMPRESSED */		return 0;	}	if(FNAME && (NEW_FILE || !POST_FILTER)) {		char	nextchar = (POST_FILTER == ON)?'\n':' ';		char	*prevstring = (POST_FILTER == ON)?"\n":"";		if (agrep_finalfp != NULL)			fprintf(agrep_finalfp, "%s%s", prevstring, CurrentFileName);		else {			int outindex;			if (prevstring[0] != '\0') {				if(agrep_outpointer + 1 >= agrep_outlen) {					OUTPUT_OVERFLOW;					return -1;				}				else agrep_outbuffer[agrep_outpointer ++] = prevstring[0];			}			for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&					(CurrentFileName[outindex] != '\0'); outindex++) {				agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];			}			if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {				OUTPUT_OVERFLOW;				return -1;			}			agrep_outpointer 
+= outindex;		}		if (PRINTFILETIME) {			char *s = aprint_file_time(CurrentFileTime);			if (agrep_finalfp != NULL)				fprintf(agrep_finalfp, "%s", s);			else {				int outindex;				for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&						(s[outindex] != '\0'); outindex++) {					agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];				}				if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {					OUTPUT_OVERFLOW;					return -1;				}				agrep_outpointer += outindex;			}		}		if (agrep_finalfp != NULL)			fprintf(agrep_finalfp, ":%c", nextchar);		else {			if (agrep_outpointer+2>= agrep_outlen) {				OUTPUT_OVERFLOW;				return -1;			}			else {				agrep_outbuffer[agrep_outpointer++] = ':';				agrep_outbuffer[agrep_outpointer++] = nextchar;			}		}		NEW_FILE = OFF;		PRINTED = 1;	}	if(BYTECOUNT) {		if (agrep_finalfp != NULL)			fprintf(agrep_finalfp, "%d= ", CurrentByteOffset);		else {			char s[32];			int  outindex;			sprintf(s, "%d= ", CurrentByteOffset);			for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&					(s[outindex] != '\0'); outindex++) {				agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];			}			if (s[outindex] != '\0') {				OUTPUT_OVERFLOW;				return -1;			}			agrep_outpointer += outindex;		}		PRINTED = 1;	}	if (PRINTOFFSET) {		if (agrep_finalfp != NULL)			fprintf(agrep_finalfp, "@%d{%d} ", CurrentByteOffset - (text + oldi-curtextbegin), curtextend-curtextbegin);		else {			char s[32];			int outindex;			sprintf(s, "@%d{%d} ", CurrentByteOffset - (text + oldi-curtextbegin), curtextend-curtextbegin);			for (outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&					 (s[outindex] != '\0'); outindex ++) {				agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];			}			if (s[outindex] != '\0') {				OUTPUT_OVERFLOW;				return -1;			}			agrep_outpointer += outindex;		}		PRINTED = 1;	}	if (PRINTRECORD) {	if (TCOMPRESSED == ON) {#if	MEASURE_TIMES		gettimeofday(&initt, NULL);#endif	/*MEASURE_TIMES*/		if (agrep_finalfp != 
NULL) {			newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, curtextbegin, curtextend-curtextbegin, agrep_finalfp, -1, EASYSEARCH);		}		else {			if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, curtextbegin, curtextend-curtextbegin, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {				if (agrep_outpointer + newlen + 1 >= agrep_outlen) {					OUTPUT_OVERFLOW;					return -1;				}				agrep_outpointer += newlen;			}		}#if	MEASURE_TIMES		gettimeofday(&finalt, NULL);		OUTFILTER_ms +=  (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);#endif	/*MEASURE_TIMES*/	}	else {		if (agrep_finalfp != NULL) {			fwrite(curtextbegin, 1, curtextend - curtextbegin, agrep_finalfp);		}		else {			if (agrep_outpointer + curtextend - curtextbegin >= agrep_outlen) {				OUTPUT_OVERFLOW;				return -1;			}			memcpy(agrep_outbuffer + agrep_outpointer, curtextbegin, curtextend - curtextbegin);			agrep_outpointer += curtextend - curtextbegin;		}	}	}	else if (PRINTED) {		if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp);		else agrep_outbuffer[agrep_outpointer ++] = '\n';		PRINTED = 0;	}	return 0;}static voidprep_bm(Pattern, m)      unsigned char *Pattern;register m;{	int i;	unsigned hash;	unsigned char lastc;	for (i = 0; i < MAXSYM; i++) SHIFT[i] = m;	for (i = m-1; i>=0; i--) {		hash = TR[Pattern[i]];		if((int)(SHIFT[hash]) >= (int)(m - 1)) SHIFT[hash] = m-1-i;	}	shift_1 = m-1;	/* shift_1 records the previous occurrence of the last character of	the pattern.  When we match this last character but do not have a match,	we can shift until we reach the next occurrence from the right. 
*/	lastc = TR[Pattern[m-1]];	for (i= m-2; i>=0; i--) {		if(TR[Pattern[i]] == lastc )		{ 			shift_1 = m-1 - i;  			i = -1; 		}	}	if(shift_1 == 0) shift_1 = 1; /* can never happen - Udi 11/7/94 */	if(NOUPPER) for(i=0; i<MAXSYM; i++) {		if (isupper(i)) SHIFT[i] = SHIFT[tolower(i)];		/* SHIFT[i] = SHIFT[i +  'a' - 'A']; */	}#ifdef DEBUG	for(i='a'; i<='z'; i++) printf("%c: %d", i, SHIFT[i]); 	printf("\n");	for(i='A'; i<='Z'; i++) printf("%c: %d", i, SHIFT[i]); 	printf("\n");#endif}/* monkey uses two characters for delta_1 shifting */CHARTYPE SHIFT_2[MAX_SHIFT_2];intmonkey( pat, m, text, textend  ) register int m  ; register CHARTYPE *text, *textend, *pat;{	int PRINTED = 0;	register unsigned hash;	register CHARTYPE shift;	register int  m1, j; 	CHARTYPE *textbegin = text;	CHARTYPE *textstart;	int newlen;	CHARTYPE *curtextbegin;	CHARTYPE *curtextend;#if	MEASURE_TIMES	struct timeval initt, finalt;#endif	CHARTYPE *lastout = text;	m1 = m - 1;	text = text+m1;	CurrentByteOffset += m1;	while (text < textend) {		textstart = text;		hash = TR[*text];		hash = (hash << 3) + TR[*(text-1)];		shift = SHIFT_2[hash];		while(shift) {			text = text + shift;			hash = (TR[*text] << 3) + TR[*(text-1)];			shift = SHIFT_2[hash];		}		CurrentByteOffset += text - textstart;		j = 0;		while(TR[pat[m1 - j]] == TR[*(text - j)]) { 			if(++j == m) break; 		}		if (j == m ) {			if(text > textend) return 0; /* Udi: used to be >= for some reason */			/* added by Udi 11/7/94 */			if(WORDBOUND) {				/* if(isalnum(*(unsigned char *)(text+1))) goto CONT; --> fixed by SHIOZAKI Takehiko <takehi-s@ascii.co.jp> */				if((text+1 <= textend) && isalnum(*(unsigned char *)(text+1)) && isalnum(*(unsigned char *)text)) {					goto CONT;	/* as if there was no match */				}				/* if(isalnum(*(unsigned char *)(text-m))) goto CONT; --> fixed by SHIOZAKI Takehiko <takehi-s@ascii.co.jp> */				if((textbegin <= (text-m)) && isalnum(*(unsigned char *)(text-m)) && isalnum(*(unsigned char *)(text-m+1))) {					goto CONT;	/* as if 
there was no match */				}				/* changed by Udi 11/7/94 to avoid having to set TR[] to W_delim */			}			if (TCOMPRESSED == ON) {				/* Don't update CurrentByteOffset here: only before outputting properly */				if (!DELIMITER) {					curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));					if (*curtextbegin == '\n') curtextbegin ++;					curtextend = curtextbegin /*text-m*/; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;					if (*curtextend == '\n') curtextend ++;				}				else {					curtextbegin = backward_delimiter(text, textbegin, tc_D_pattern, tc_D_length, OUTTAIL);					curtextend = forward_delimiter(curtextbegin /*text -m*/, textend, tc_D_pattern, tc_D_length, OUTTAIL);				}			}			else {				/* Don't update CurrentByteOffset here: only before outputting properly */				if (!DELIMITER) {					curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));					if (*curtextbegin == '\n') curtextbegin ++;					curtextend = curtextbegin /*text-m*/; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;					if (*curtextend == '\n') curtextend ++;				}				else {					curtextbegin = backward_delimiter(text, textbegin, D_pattern, D_length, OUTTAIL);					curtextend = forward_delimiter(curtextbegin/*text -m*/, textend, D_pattern, D_length, OUTTAIL);				}			}			if (TCOMPRESSED == ON) {#if     MEASURE_TIMES                                gettimeofday(&initt, NULL);#endif  /*MEASURE_TIMES*/				if (-1 == exists_tcompressed_word(pat, m, curtextbegin, text - curtextbegin + m, EASYSEARCH))					goto CONT;	/* as if there was no match */#if     MEASURE_TIMES                                gettimeofday(&finalt, NULL);                                FILTERALGO_ms +=  (finalt.tv_sec *1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);#endif  /*MEASURE_TIMES*/			}			textbegin = curtextend; /*(curtextend - 1 > textbegin ? 
curtextend - 1 : curtextend); */			num_of_matched++;			if(FILENAMEONLY)  return 0;			if (!COUNT) {				if (!INVERSE) {					if(FNAME && (NEW_FILE || !POST_FILTER)) {						char	nextchar = (POST_FILTER == ON)?'\n':' ';						char	*prevstring = (POST_FILTER == ON)?"\n":"";						if (agrep_finalfp != NULL)							fprintf(agrep_finalfp, "%s%s", prevstring, CurrentFileName);						else {							int outindex;							if (prevstring[0] != '\0') {								if(agrep_outpointer + 1 >= agrep_outlen) {									OUTPUT_OVERFLOW;									return -1;								}								else agrep_outbuffer[agrep_outpointer ++] = prevstring[0];							}							for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&									(CurrentFileName[outindex] != '\0'); outindex++) {								agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];							}							if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {								OUTPUT_OVERFLOW;								return -1;							}							agrep_outpointer += outindex;						}						if (PRINTFILETIME) {							char *s = aprint_file_time(CurrentFileTime);							if (agrep_finalfp != NULL)								fprintf(agrep_finalfp, "%s", s);							else {								int outindex;								for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&										(s[outindex] != '\0'); outindex++) {									agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];								}								if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {									OUTPUT_OVERFLOW;									return -1;								}								agrep_outpointer += outindex;							}						}						if (agrep_finalfp != NULL)							fprintf(agrep_finalfp, ":%c", nextchar);						else {							if (agrep_outpointer+2>= agrep_outlen) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -