⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sgrep.c

📁 harvest是一个下载html网页得机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
						}						PRINTED = 1;					}					if (PRINTOFFSET) {						if (agrep_finalfp != NULL)							fprintf(agrep_finalfp, "@%d{%d} ", CurrentByteOffset - (text -curtextbegin), curtextend-curtextbegin);						else {							char s[32];							int outindex;							sprintf(s, "@%d{%d} ", CurrentByteOffset - (text -curtextbegin), curtextend-curtextbegin);							for (outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&									 (s[outindex] != '\0'); outindex ++) {								agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];							}							if (s[outindex] != '\0') {								OUTPUT_OVERFLOW;								return -1;							}							agrep_outpointer += outindex;						}						PRINTED = 1;					}					CurrentByteOffset += textbegin - text;					text = textbegin;					if (PRINTRECORD) {					if (TCOMPRESSED == ON) {#if	MEASURE_TIMES						gettimeofday(&initt, NULL);#endif	/*MEASURE_TIMES*/						if (agrep_finalfp != NULL)							newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, curtextbegin, curtextend-curtextbegin, agrep_finalfp, -1, EASYSEARCH);						else {							if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, curtextbegin, curtextend-curtextbegin, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {								if (agrep_outpointer + newlen + 1 >= agrep_outlen) {									OUTPUT_OVERFLOW;									return -1;								}								agrep_outpointer += newlen;							}						}#if	MEASURE_TIMES						gettimeofday(&finalt, NULL);						OUTFILTER_ms +=  (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);#endif	/*MEASURE_TIMES*/					}					else {						if (agrep_finalfp != NULL) {							fwrite(curtextbegin, 1, curtextend - curtextbegin, agrep_finalfp);						}						else {							if (agrep_outpointer + curtextend - curtextbegin >= agrep_outlen) {								OUTPUT_OVERFLOW;								return -1;							}							memcpy(agrep_outbuffer+agrep_outpointer, curtextbegin, curtextend-curtextbegin);							agrep_outpointer += curtextend - curtextbegin;						}					}					}					else if (PRINTED) {						if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp);						else agrep_outbuffer[agrep_outpointer ++] = '\n';						PRINTED = 0;					}				}				else {	/* INVERSE */					if (!SILENT) {					if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */						if (agrep_finalfp != NULL)							newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_finalfp, -1, EASYSEARCH);						else {							if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {								if (newlen + agrep_outpointer >= agrep_outlen) {									OUTPUT_OVERFLOW;									return -1;								}								agrep_outpointer += newlen;							}						}						lastout=textbegin;						CurrentByteOffset += textbegin - text;						text = textbegin;					}					else { /* NOT TCOMPRESSED */						if (agrep_finalfp != NULL)							fwrite(lastout, 1, curtextbegin-lastout, agrep_finalfp);						else {							if (curtextbegin - lastout + agrep_outpointer >= agrep_outlen) {								OUTPUT_OVERFLOW;								return -1;							}							memcpy(agrep_outbuffer+agrep_outpointer, lastout, curtextbegin-lastout);							agrep_outpointer += (curtextbegin - lastout);						}						lastout=textbegin;						CurrentByteOffset += textbegin - text;						text = textbegin;					} /* TCOMPRESSED */					} /* !SILENT */				} /* INVERSE */			}			else {	/* COUNT */				CurrentByteOffset += textbegin - text;				text = textbegin;			}			if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) ||			    ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0;	/* done */CONT:			if (m == 1) shift = 0; else shift = 1;	/* ZZZZZZZZZZZZZZZZ check it out later */		}		else shift = d1;WCONT:	;	}	if (!SILENT && INVERSE && !COUNT && (lastout <= textend)) {		if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */			if (agrep_finalfp != NULL)				newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_finalfp, -1, EASYSEARCH);			else {				if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {					if (newlen + agrep_outpointer >= agrep_outlen) {						OUTPUT_OVERFLOW;						return -1;					}					agrep_outpointer += newlen;				}			}		}		else { /* NOT TCOMPRESSED */			if (agrep_finalfp != NULL)				fwrite(lastout, 1, textend-lastout + 1, agrep_finalfp);			else {				if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) {					OUTPUT_OVERFLOW;					return -1;				}				memcpy(agrep_outbuffer+agrep_outpointer, lastout, textend-lastout + 1);				agrep_outpointer += (textend - lastout + 1);			}		} /* TCOMPRESSED */	}	return 0;}/* initmask() initializes the mask table for the pattern                    */ /* endposition is a mask for the endposition of the pattern                 *//* endposition will contain k mask bits if the pattern contains k fragments */static voidinitmask(pattern, Mask, m, D, endposition)CHARTYPE *pattern; unsigned *Mask; register int m, D; unsigned *endposition;{	register unsigned Bit1, c;	register int i, j, frag_num;	/* Bit1 = 1 << 31;*/    /* the first bit of Bit1 is 1, others 0.  */	Bit1 = (unsigned)0x80000000;	frag_num = D+1; 	*endposition = 0;	for (i = 0; i < frag_num; i++) *endposition = *endposition | (Bit1 >> i);	*endposition = *endposition >> (m - frag_num);	for(i = 0; i < m; i++) 		if (pattern[i] == '^' || pattern[i] == '$') {			pattern[i] = '\n'; 		}	for(i = 0; i < MAXSYM; i++) Mask[i] = ~0;	for(i = 0; i < m; i++)     /* initialize the mask table */	{  		c = pattern[i];		for ( j = 0; j < m; j++)			if( c == pattern[j] )				Mask[c] = Mask[c] & ~( Bit1 >> j ) ;	}}static voidprep(Pattern, M, D)             /* preprocessing for partitioning_bm */CHARTYPE *Pattern;  /* can be fine-tuned to choose a better partition */register int M, D;{	register int i, j, k, p, shift;	register unsigned m;	unsigned hash, b_size = 3;	m = M/(D+1);	p = M - m*(D+1);	for (i = 0; i < MAXSYM; i++) SHIFT[i] = m;	for (i = M-1; i>=p ; i--) {		shift = (M-1-i)%m;		hash = Pattern[i];		if((int)(SHIFT[hash]) > (int)(shift)) SHIFT[hash] = shift;	}#ifdef DEBUG	for(i=0; i<M; i++) printf(" %d,", SHIFT[Pattern[i]]);	printf("\n");#endif	shift_1 = m;	for(i=0; i<D+1; i++) {		j = M-1 - m*i;		for(k=1; k<m; k++) {			for(p=0; p<D+1; p++) 				if(Pattern[j-k] == Pattern[M-1-m*p]) 					if(k < shift_1) shift_1 = k;		}	}#ifdef DEBUG	printf("\nshift_1 = %d", shift_1);#endif	if(shift_1 == 0) shift_1 = 1;	for(i=0; i<MAXMEMBER; i++) MEMBER[i] = 0;	if (m < 3) b_size = m;	for(i=0; i<D+1; i++) {		j = M-1 - m*i;		hash = 0;		for(k=0; k<b_size; k++) {			hash = (hash << 2) + Pattern[j-k];		}#ifdef DEBUG		printf(" hash = %d,", hash);#endif		MEMBER[hash] = 1;	}}intagrep( pat, M, text, textend, D, oldpat, oldM) int M, D, oldM; register CHARTYPE *text, *textend, *pat, *oldpat;{	register int i;	register int m = M/(D+1);	register CHARTYPE *textbegin;	CHARTYPE *textstart;	register int shift, HASH;	int  j=0, k, d1;	int  n, cdx;	int  Candidate[MaxCan][2], round, lastend=0;	unsigned R1[MaxError+1], R2[MaxError+1]; 	register unsigned int r1, endpos, c; 	unsigned currentpos;	unsigned Bit1;	unsigned r_newline;	int oldbyteoffset;	CHARTYPE *lastout = text;	int newlen;	Candidate[0][0] = Candidate[0][1] = 0; 	d1 = shift_1;	cdx = 0;	if(m < 3) r1 = m;	else r1 = 3;	textbegin = text;	shift = m-1;	while (text < textend) {		textstart = text;		shift = SHIFT[*(text += shift)];		while(shift) {			shift = SHIFT[*(text += shift)];			shift = SHIFT[*(text += shift)];		}		CurrentByteOffset += text - textstart;		j = 1; 		HASH = *text;		while(j < r1) { 			HASH = (HASH << 2) + *(text-j);			j++; 		}		if (MEMBER[HASH]) { 			i = text - textbegin;			if((i - M - D - 10) > Candidate[cdx][1]) { 					Candidate[++cdx][0] = i-M-D-2;				Candidate[cdx][1] = i+M+D; 			}			else Candidate[cdx][1] = i+M+D;			shift = d1;		}		else shift = d1;	}	CurrentByteOffset += (textbegin - text);	text = textbegin;	n = textend - textbegin;	r_newline = '\n';	/* for those candidate areas, find the D-error matches                     */	if(Candidate[1][0] < 0) Candidate[1][0] = 0;	endpos = endposition;                /* the mask table and the endposition */	/* Bit1 = (1 << 31); */	Bit1 = (unsigned)0x80000000;	oldbyteoffset = CurrentByteOffset;	for(round = 0; round <= cdx; round++)	{  		i = Candidate[round][0] ; 		if(Candidate[round][1] > n) Candidate[round][1] = n;		if(i < 0) i = 0;		CurrentByteOffset = oldbyteoffset+i;		R1[0] = R2[0] = ~0;		R1[1] = R2[1] = ~Bit1;		for(k = 1; k <= D; k++) R1[k] = R2[k] = (R1[k-1] >> 1) & R1[k-1];		while (i < Candidate[round][1])                     		{  			c = text[i++];			CurrentByteOffset ++;			if(c == r_newline) {				for(k = 0 ; k <= D; k++) R1[k] = R2[k] = (~0 );			}			r1 = Mask[c];			R1[0] = (R2[0] >> 1) | r1;			for(k=1; k<=D; k++)				R1[k] = ((R2[k] >> 1) | r1) & R2[k-1] & ((R1[k-1] & R2[k-1]) >> 1);			if((R1[D] & endpos) == 0) { 				num_of_matched++;				if(FILENAMEONLY) return 0; 				currentpos = i;				if(i <= lastend) {					CurrentByteOffset += lastend - i;					i = lastend;				}				else {					int oldcurrentpos = currentpos;					if (-1 == s_output(text, &currentpos, textbegin, textend, &lastout, pat, M, oldpat, oldM)) return -1;					CurrentByteOffset += currentpos - oldcurrentpos;					i = currentpos; 				}				lastend = i;				for(k=0; k<=D; k++) R1[k] = R2[k] = ~0;				if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) ||				    ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0;	/* done */			}			/* copying the code to save a few instructions.			you need to understand the shift-or algorithm			to figure this one... */			c = text[i++];			CurrentByteOffset ++;			if(c == r_newline) {				for(k = 0 ; k <= D; k++) R1[k] = R2[k] = (~0 );			}			r1 = Mask[c];			R2[0] = (R1[0] >> 1) | r1;			for(k = 1; k <= D; k++)				R2[k] = ((R1[k] >> 1) | r1) & R1[k-1] & ((R1[k-1] & R2[k-1]) >> 1);			if((R2[D] & endpos) == 0) { 				currentpos = i;				num_of_matched++;				if(FILENAMEONLY) return 0; 				if(i <= lastend) {					CurrentByteOffset += lastend - i;					i = lastend;				}				else {					int oldcurrentpos = currentpos;					if (-1 == s_output(text, &currentpos, textbegin, textend, &lastout, pat, M, oldpat, oldM)) return -1;					CurrentByteOffset += currentpos - oldcurrentpos;					i = currentpos; 				}				lastend = i;				for(k=0; k<=D; k++) R1[k] = R2[k] = ~0;				if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) ||				    ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0;	/* done */			}		}	}	if (!SILENT && INVERSE && !COUNT && (lastout <= textend)) {		if (TCOMPRESSED == ON) { /* INVERSE: Don't care about filtering time */			if (agrep_finalfp != NULL)				newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_finalfp, -1, EASYSEARCH);			else {				if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {					if (newlen + agrep_outpointer >= agrep_outlen) {						OUTPUT_OVERFLOW;						return -1;					}					agrep_outpointer += newlen;				}			}		}		else { /* NOT TCOMPRESSED */			if (agrep_finalfp != NULL)				fwrite(lastout, 1, textend-lastout + 1, agrep_finalfp);			else {				if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) {					OUTPUT_OVERFLOW;					return -1;				}				memcpy(agrep_outbuffer+agrep_outpointer, lastout, textend-lastout + 1);				agrep_outpointer += (textend - lastout + 1);			}		} /* TCOMPRESSED */	}	return 0;}/* Don't update CurrentByteOffset here: done by caller */ints_output(text, i, textbegin, textend, lastout, pat, m, oldpat, oldm) int *i;	/* in, out */int m, oldm; CHARTYPE *text, *textbegin, *textend, *pat, *oldpat;CHARTYPE **lastout;	/* in, out */{	int PRINTED = 0;	int newlen; 	int oldi;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -