📄 sgrep.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
					}				}			}			else {				if(DNA) {					if (-1 == monkey4( pat, m, text+start, text+end, D , oldpat, oldm )) {						free_buf(fd, text);						return -1;					}				}				else {					if(m >= LONG_APPX) {						if (-1 == a_monkey(pat, m, text+start, text+end, D, oldpat, oldm)) {							free_buf(fd, text);							return -1;						}					}					else {						if (-1 == agrep(pat, m, text+start, text+end, D, oldpat, oldm)) {							free_buf(fd, text);							return -1;						}					}				}			}			if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) {				if (agrep_finalfp != NULL)					fprintf(agrep_finalfp, "%s", CurrentFileName);				else {					int outindex;					for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&							(CurrentFileName[outindex] != '\0'); outindex++) {						agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];					}					if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {						OUTPUT_OVERFLOW;						free_buf(fd, text);						return -1;					}					agrep_outpointer += outindex;				}				if (PRINTFILETIME) {					char *s = aprint_file_time(CurrentFileTime);					if (agrep_finalfp != NULL)						fprintf(agrep_finalfp, "%s", s);					else {						int outindex;						for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&								(s[outindex] != '\0'); outindex++) {							agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];						}						if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {							OUTPUT_OVERFLOW;							free_buf(fd, text);							return -1;						}						agrep_outpointer += outindex;					}				}				if (agrep_finalfp != NULL)					fprintf(agrep_finalfp, "\n");				else {					if (agrep_outpointer+1>=agrep_outlen) {						OUTPUT_OVERFLOW;						free_buf(fd, text);						return -1;					}					else agrep_outbuffer[agrep_outpointer++] = '\n';				}				free_buf(fd, text);				NEW_FILE = OFF;				return 0; 			}                }		free_buf(fd, text);		return 0;#if	
AGREP_POINTER	}	else {	/* as if only one iteration of the while-loop and offset = 0 */		tempbuf = (CHARTYPE*)malloc(m);		text = (CHARTYPE *)agrep_inbuffer;		num_read = agrep_inlen;		start = 0;		buf_end = end = num_read - 1;#if	0		if (WHOLELINE) {			start --;			CurrentByteOffset --;		}#endif		if ((TCOMPRESSED == ON) && tuncompressible(text+1, num_read)) {			EASYSEARCH = text[offset+SIGNATURE_LEN-1];			start += SIGNATURE_LEN;			CurrentByteOffset += SIGNATURE_LEN;			if (!EASYSEARCH) {				fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName);			}#if	MEASURE_TIMES			gettimeofday(&initt, NULL);#endif	/*MEASURE_TIMES*/			if (samepattern || ((newm = quick_tcompress(FREQ_FILE, HASH_FILE, pat, m, newpat, Max_record-8, EASYSEARCH)) > 0)) {				oldm = m;				oldpat = pat;				m = newm;				pat = newpat;			}#if	MEASURE_TIMES			gettimeofday(&finalt, NULL);			INFILTER_ms +=  (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);#endif	/*MEASURE_TIMES*/		}		else TCOMPRESSED = OFF;		PROCESS_PATTERN	/* must be after we know whether it is compressed or not */		memcpy(tempbuf, text+end+1, m);	/* save portion being overwritten */		for(i=1; i<=m; i++) text[end+i] = pat[m-1];		/* to make sure the skip loop in bm() won't go out of bound in later iterations */                        if (!DELIMITER)                                while(text[end]  != '\n' && end > 1) end--;                        else {                                unsigned char *newbuf = text + end + 1;                                newbuf = backward_delimiter(newbuf, text, D_pattern, D_length, OUTTAIL);        /* see agrep.c/'d' */				if (newbuf < text+offset+D_length) newbuf = text + end + 1;                                end = newbuf - text - 1;                        }                        /* text[0] = text[end] = r_newline; : the user must ensure that the delimiter is there at text[0] and occurs somewhere before text[end ] */			/* An exact 
copy of the above SGREP_PROCESS */			/* No harm in sending a few extra parameters even if they are unused: they are not accessed in monkey*()s */			if(D==0)  {				if(m > LONG_EXAC) {					if (-1 == monkey(pat, m, text+start, text+end, oldpat, oldm)) {						free_buf(fd, text);						memcpy(text+end+1, tempbuf, m); /* restore */						free(tempbuf);						return -1;					}				}				else {					if (-1 == bm(pat, m, text+start, text+end, oldpat, oldm)) {						free_buf(fd, text);						memcpy(text+end+1, tempbuf, m); /* restore */						free(tempbuf);						return -1;					}				}			}			else {				if(DNA) {					if (-1 == monkey4( pat, m, text+start, text+end, D , oldpat, oldm )) {						free_buf(fd, text);						memcpy(text+end+1, tempbuf, m); /* restore */						free(tempbuf);						return -1;					}				}				else {					if(m >= LONG_APPX) {						if (-1 == a_monkey(pat, m, text+start, text+end, D, oldpat, oldm)) {							free_buf(fd, text);							memcpy(text+end+1, tempbuf, m); /* restore */							free(tempbuf);							return -1;						}					}					else {						if (-1 == agrep(pat, m, text+start, text+end, D, oldpat, oldm)) {							free_buf(fd, text);							memcpy(text+end+1, tempbuf, m); /* restore */							free(tempbuf);							return -1;						}					}				}			}			if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) {	/* externally set */				if (agrep_finalfp != NULL)					fprintf(agrep_finalfp, "%s", CurrentFileName);				else {					int outindex;					for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&							(CurrentFileName[outindex] != '\0'); outindex++) {						agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];					}					if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {						OUTPUT_OVERFLOW;						free_buf(fd, text);						return -1;					}					agrep_outpointer += outindex;				}				if (PRINTFILETIME) {					char *s = aprint_file_time(CurrentFileTime);					if (agrep_finalfp != NULL)						
fprintf(agrep_finalfp, "%s", s);					else {						int outindex;						for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&								(s[outindex] != '\0'); outindex++) {							agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];						}						if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {							OUTPUT_OVERFLOW;							free_buf(fd, text);							return -1;						}						agrep_outpointer += outindex;					}				}				if (agrep_finalfp != NULL)					fprintf(agrep_finalfp, "\n");				else {					if (agrep_outpointer+1>=agrep_outlen) {						OUTPUT_OVERFLOW;						free_buf(fd, text);						return -1;					}					else agrep_outbuffer[agrep_outpointer++] = '\n';				}				free_buf(fd, text);				NEW_FILE = OFF;			}		memcpy(text+end+1, tempbuf, m); /* restore */		free(tempbuf);		return 0;	}#endif	/*AGREP_POINTER*/} /* end sgrep *//* SUN: bm assumes that the content of text[n]...text[n+m-1] is pat[m-1] such that the skip loop is guaranteed to terminated */intbm(pat, m, text, textend, oldpat, oldm)CHARTYPE *text, *textend, *pat, *oldpat;int m, oldm;{	int PRINTED = 0;	register int shift;	register int  m1, j, d1; 	CHARTYPE *textbegin = text;	int newlen;	CHARTYPE *textstart;	CHARTYPE *curtextbegin;	CHARTYPE *curtextend;#if	MEASURE_TIMES	struct timeval initt, finalt;#endif	CHARTYPE *lastout = text;	d1 = shift_1;    /* at least 1 */	m1 = m - 1;	shift = 0;       	while (text <= textend) {		textstart = text;		shift = SHIFT[*(text += shift)];		while(shift) {         			shift = SHIFT[*(text += shift)];			shift = SHIFT[*(text += shift)];			shift = SHIFT[*(text += shift)];		}		CurrentByteOffset += text - textstart;		j = 0;		while(TR[pat[m1 - j]] == TR[*(text - j)]) {			if(++j == m)  break;       /* if statement can be saved, but for safty ... 
*/		}		if (j == m ) { 			if(text > textend) return 0;			if(WORDBOUND) {				/* if(isalnum(*(unsigned char *)(text+1))) goto CONT; --> fixed by SHIOZAKI Takehiko <takehi-s@ascii.co.jp> */				if((text+1 <= textend) && isalnum(*(unsigned char *)(text+1)) && isalnum(*(unsigned char *)text)) {					shift = 1;	/* bg 4/27/97 */					goto WCONT;	/* as if there was no match */				}				/* if(isalnum(*(unsigned char *)(text-m))) goto CONT; --> fixed by SHIOZAKI Takehiko <takehi-s@ascii.co.jp> */				if((textbegin <= (text-m)) && isalnum(*(unsigned char *)(text-m)) && isalnum(*(unsigned char *)(text-m+1))) {					shift = 1;	/* bg 4/27/97 */					goto WCONT;	/* as if there was no match */				}				/* changed by Udi 11/7/94 to avoid having to set TR[] to W_delim */			}			if (TCOMPRESSED == ON) {				/* Don't update CurrentByteOffset here: only before outputting properly */				if (!DELIMITER) {					curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));					if (*curtextbegin == '\n') curtextbegin ++;					curtextend = curtextbegin; /*text-m*/; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;					if (*curtextend == '\n') curtextend ++;				}				else {					curtextbegin = backward_delimiter(text, textbegin, tc_D_pattern, tc_D_length, OUTTAIL);					if (!OUTTAIL) {						curtextend = forward_delimiter(curtextbegin+D_length/*text-m*/, textend, tc_D_pattern, tc_D_length, OUTTAIL);					} else {						curtextend = forward_delimiter(curtextbegin/*text-m*/, textend, tc_D_pattern, tc_D_length, OUTTAIL);					}				}			}			else {				/* Don't update CurrentByteOffset here: only before outputting properly */				if (!DELIMITER) {					curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));					if (*curtextbegin == '\n') curtextbegin ++;					curtextend = curtextbegin /*text-m*/; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;					if (*curtextend == '\n') curtextend ++;				}				else {					curtextbegin = 
backward_delimiter(text, textbegin, D_pattern, D_length, OUTTAIL);					if (!OUTTAIL) {						curtextend = forward_delimiter(curtextbegin+D_length/*text-m*/, textend, D_pattern, D_length, OUTTAIL);					} else {						curtextend = forward_delimiter(curtextbegin/*text-m*/, textend, D_pattern, D_length, OUTTAIL);					}				}			}			if (TCOMPRESSED == ON) {#if     MEASURE_TIMES                                gettimeofday(&initt, NULL);#endif  /*MEASURE_TIMES*/				if (-1 == exists_tcompressed_word(pat, m, curtextbegin, text - curtextbegin + m, EASYSEARCH))					goto CONT;	/* as if there was no match */#if     MEASURE_TIMES                                gettimeofday(&finalt, NULL);                                FILTERALGO_ms +=  (finalt.tv_sec *1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);#endif  /*MEASURE_TIMES*/			}			textbegin = curtextend; /* (curtextend - 1 > textbegin ? curtextend - 1 : curtextend); */			num_of_matched++;			if(FILENAMEONLY) return 0;			if(!COUNT) {				if (!INVERSE) {					if(FNAME && (NEW_FILE || !POST_FILTER)) {						char	nextchar = (POST_FILTER == ON)?'\n':' ';						char	*prevstring = (POST_FILTER == ON)?"\n":"";						if (agrep_finalfp != NULL)							fprintf(agrep_finalfp, "%s%s", prevstring, CurrentFileName);						else {							int outindex;							if (prevstring[0] != '\0') {								if(agrep_outpointer + 1 >= agrep_outlen) {									OUTPUT_OVERFLOW;									return -1;								}								else agrep_outbuffer[agrep_outpointer ++] = prevstring[0];							}							for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&									(CurrentFileName[outindex] != '\0'); outindex++) {								agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];							}							if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {								OUTPUT_OVERFLOW;								return -1;							}							agrep_outpointer += outindex;						}						if (PRINTFILETIME) {							char *s = aprint_file_time(CurrentFileTime);							if 
(agrep_finalfp != NULL)								fprintf(agrep_finalfp, "%s", s);							else {								int outindex;								for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&										(s[outindex] != '\0'); outindex++) {									agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];								}								if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {									OUTPUT_OVERFLOW;									return -1;								}								agrep_outpointer += outindex;							}						}						if (agrep_finalfp != NULL)							fprintf(agrep_finalfp, ":%c", nextchar);						else {							if (agrep_outpointer+2>= agrep_outlen) {								OUTPUT_OVERFLOW;								return -1;							}							else {								agrep_outbuffer[agrep_outpointer++] = ':';								agrep_outbuffer[agrep_outpointer++] = nextchar;							}						}						NEW_FILE = OFF;						PRINTED = 1;					}					if(BYTECOUNT) {						if (agrep_finalfp != NULL)							fprintf(agrep_finalfp, "%d= ", CurrentByteOffset);						else {							char s[32];							int  outindex;							sprintf(s, "%d=", CurrentByteOffset);							for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&									(s[outindex] != '\0'); outindex++) {								agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];							}							if (s[outindex] != '\0') {								OUTPUT_OVERFLOW;								return -1;							}							agrep_outpointer += outindex;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -