⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 newmgrep.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
		if (!SILENT) {                if (agrep_finalfp != NULL) {                        while(lastout <= textend) fputc(*lastout++, agrep_finalfp);                }                else {                        if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) {                                OUTPUT_OVERFLOW;                                return -1;                        }                        memcpy(agrep_outbuffer+agrep_outpointer, lastout, text-lastout+1);                        agrep_outpointer += (text-lastout+1);                        lastout = textend;                }		}        }        return 0;}#if	DOTCOMPRESSED/* shift is always 1: slight change in MATCHED semantics: it is set to 1 even if COUNT is set: previously, it wasn't set. Will it effect m_short? */inttc_m_short(text, start, end)int start, end; register uchar *text;{	int m1=1;	int PRINTED = 0;	int pat_index;        unsigned char *oldtext;	register uchar *textend;	unsigned char *textbegin;	unsigned char *curtextend;	unsigned char *curtextbegin;	register int p, p_end;	int MATCHED=0;	/* int OUT=0; */	uchar *lastout;	uchar *qx;	uchar *px;	int j;	int DOWITHMASK;	struct timeval initt, finalt;	int newlen;	DOWITHMASK = 0;	if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);	textend = text + end;	lastout = text + start;	text = text + start - 1 ;	textbegin = text + start;	/* WORDBOUND adjustment not required */	while (++text <= textend) {		CurrentByteOffset ++;		p = tc_HASH[tc_tr[*text]];		p_end = tc_HASH[tc_tr[*text]+1];		while(p++ < p_end) {			if (((pat_index = tc_pat_indices[p]) <= 0) || (tc_pat_len[pat_index] <= 0)) continue;#ifdef	debug			printf("m_short(): p=%d pat_index=%d off=%d\n", p, pat_index, textend - text);#endif			px = tc_PatPtr[p];			qx = text;			while((*px!=0)&&(tc_tr[*px] == tc_tr[*qx])) {				px++;				qx++;			}			if (*px == 0) {				if(text >= textend) return 0;				if (!DOWITHMASK) {					/* Don't update CurrentByteOffset here: only before outputting properly */					
if (!DELIMITER) {						curtextbegin = text; while((curtextbegin > textbegin) && (*(--curtextbegin) != '\n'));						if (*curtextbegin == '\n') curtextbegin ++;						curtextend = curtextbegin /*text-m1*/; while((curtextend < textend) && (*curtextend != '\n')) curtextend ++;						if (*curtextend == '\n') curtextend ++;					}					else {						curtextbegin = backward_delimiter(text, textbegin, tc_D_pattern, tc_D_length, OUTTAIL);						curtextend = forward_delimiter(curtextbegin /*text-m1*/, textend, tc_D_pattern, tc_D_length, OUTTAIL);					}				}				/* else prev curtextbegin is OK: if full AND isn't found, DOWITHMASK is 0-ed so that we search at most 1 line below */#if	MEASURE_TIMES				gettimeofday(&initt, NULL);#endif	/*MEASURE_TIMES*/				/* Was it really a match in the compressed line from prev line in text to text + strlen(tc_pat_len[pat_index]? */				if (-1 == exists_tcompressed_word(tc_PatPtr[p], tc_pat_len[pat_index], curtextbegin, text - curtextbegin + tc_pat_len[pat_index], EASYSEARCH))					goto skip_output;#if     MEASURE_TIMES				gettimeofday(&finalt, NULL);				FILTERALGO_ms +=  (finalt.tv_sec *1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);#endif  /*MEASURE_TIMES*/				if (!DOWITHMASK) {					if (!OUTTAIL || INVERSE) textbegin = curtextend;					else if (DELIMITER) textbegin = curtextend - D_length;					else textbegin = curtextend - 1;				}				DOWITHMASK = 1;				if (pat_index <= anum_terminals) {					int	iii;					amatched_terminals[pat_index - 1] = 1;					for (iii=0; iii<anum_terminals; iii++)						if (aduplicates[pat_index - 1][iii])							amatched_terminals[iii] = 1;				}				if (AComplexBoolean) {					/* Can output only after all the matches in the current record have been identified: just like filter_output */					oldtext = text;					CurrentByteOffset += (oldtext + pat_len[pat_index] - 1 - text);					text = oldtext + pat_len[pat_index] - 1;					MATCHED = 0;					goto skip_output;				}				else if ((long)AParse & AND_EXP) {					for (j=0; 
j<anum_terminals; j++) if (!amatched_terminals[j]) break;					if (j<anum_terminals) goto skip_output;				}				MATCHED = 1;				oldtext = text; /* used only if MULTI_OUTPUT */#undef	DO_OUTPUT#define DO_OUTPUT(change_text)\				num_of_matched++;\				if(FILENAMEONLY || SILENT)  return 0;\				if (!COUNT) {\					if ((PRINTED = print_options(pat_index, text, curtextbegin, curtextend)) == -1) return -1;\					if(!INVERSE) {\						if (PRINTRECORD) {\/* #if     MEASURE_TIMES\						gettimeofday(&initt, NULL);\*/ /*#endif  MEASURE_TIMES*/\						if (agrep_finalfp != NULL)\							newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, curtextbegin, curtextend-curtextbegin, agrep_finalfp, -1, EASYSEARCH);\						else {\							if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, curtextbegin, curtextend-curtextbegin, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {\								if (newlen + agrep_outpointer >= agrep_outlen) {\									OUTPUT_OVERFLOW;\									return -1;\								}\								agrep_outpointer += newlen;\							}\						}\/*#if     MEASURE_TIMES\						gettimeofday(&finalt, NULL);\						OUTFILTER_ms +=  (finalt.tv_sec* 1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);\*/ /*#endif  MEASURE_TIMES*/\						}\						else if (PRINTED) {\							if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp);\							else agrep_outbuffer[agrep_outpointer ++] = '\n';\							PRINTED = 0;\						}\						if ((change_text) && MULTI_OUTPUT) {     /* next match starting from end of current */\							CurrentByteOffset += (oldtext + tc_pat_len[pat_index] - 1 - text);\							text = oldtext + tc_pat_len[pat_index] - 1;\							MATCHED = 0;\						}\						else if (change_text) {\							CurrentByteOffset += textbegin - text;\							text = textbegin;\						}\					}\					else {	/* INVERSE: Don't care about filtering time */\						/* if(lastout < curtextbegin) OUT=1; */\						if (!SILENT) {\						if (agrep_finalfp != NULL)\							newlen = quick_tuncompress(FREQ_FILE, 
STRING_FILE, lastout, curtextbegin - lastout, agrep_finalfp, -1, EASYSEARCH);\						else {\							if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, curtextbegin - lastout, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {\								if (newlen + agrep_outpointer >= agrep_outlen) {\									OUTPUT_OVERFLOW;\									return -1;\								}\								agrep_outpointer += newlen;\							}\						}\						}\						lastout=textbegin;\						if (change_text) {\							CurrentByteOffset += textbegin - text;\							text = textbegin;\						}\					}\				}\				else if (change_text) {\					CurrentByteOffset += textbegin - text;\					text = textbegin;\				}\				if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) ||\				    ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) return 0;	/* done */\				DO_OUTPUT(1)			}		skip_output:                        if(MATCHED && !MULTI_OUTPUT && !AComplexBoolean) break;     /* else look for more possible matches */			if (DOWITHMASK && (text >= curtextend - 1)) {				DOWITHMASK = 0;				if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {					DO_OUTPUT(0)				}				if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);			}		}		/* If I found some match and I am about to cross over a delimiter, then set DOWITHMASK to 0 and zero out the amatched_terminals */		if (DOWITHMASK && (text >= curtextend - 1)) {			DOWITHMASK = 0;			if (AComplexBoolean && dd(curtextbegin, curtextend) && eval_tree(AParse, amatched_terminals)) {				DO_OUTPUT(0)			}			if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);		}		if (MATCHED) text--;		MATCHED = 0;	} /* while */	CurrentByteOffset ++;	/* Do residual stuff: check if there was a match at the end of the line | check if rest of the buffer needs to be output due to inverse */	if (DOWITHMASK && (text >= curtextend - 1)) {		DOWITHMASK = 0;		if (AComplexBoolean && dd(curtextbegin, curtextend) && 
eval_tree(AParse, amatched_terminals)) {			DO_OUTPUT(0)		}		if (AParse != 0) memset(amatched_terminals, '\0', anum_terminals);	}	if (INVERSE && !COUNT && (lastout <= textend)) {		if (!SILENT) {		if (agrep_finalfp != NULL)			newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_finalfp, -1, EASYSEARCH);		else {			if ((newlen = quick_tuncompress(FREQ_FILE, STRING_FILE, lastout, textend - lastout + 1, agrep_outbuffer, agrep_outlen - agrep_outpointer, EASYSEARCH)) > 0) {				if (newlen + agrep_outpointer >= agrep_outlen) {					OUTPUT_OVERFLOW;					return -1;				}				agrep_outpointer += newlen;			}		}		}	}        return 0;}#endif	/*DOTCOMPRESSED*/static voidf_prep(pat_index, Pattern)uchar *Pattern;   int pat_index;{int i, m;register unsigned hash=0;#ifdef debug	puts(Pattern);#endif	m = p_size;		for (i=m-1; i>=(1+LONG); i--) {				hash = (tr1[Pattern[i]]);				hash = (hash << Hbits) + (tr1[Pattern[i-1]]);		if(LONG) hash = (hash << Hbits) + (tr1[Pattern[i-2]] );		if(SHIFT1[hash] >= m-1-i) SHIFT1[hash] = m-1-i;	}	i=m-1;		hash = (tr1[Pattern[i]]);		hash = (hash << Hbits) + (tr1[Pattern[i-1]]);	if(LONG) hash = (hash << Hbits) + (tr1[Pattern[i-2]] );		if(SHORT) hash=tr[Pattern[0]];#ifdef debug	printf("hash = %d\n", hash);#endif		HASH[hash]++;		return;}#if	DOTCOMPRESSEDstatic voidtc_f_prep(pat_index, Pattern)uchar *Pattern;   int pat_index;{int i, m;register unsigned hash=0;#ifdef debug	puts(Pattern);#endif	m = tc_p_size;		for (i=m-1; i>=(1+tc_LONG); i--) {				hash = (tc_tr1[Pattern[i]]);				hash = (hash << Hbits) + (tc_tr1[Pattern[i-1]]);		if(tc_LONG) hash = (hash << Hbits) + (tc_tr1[Pattern[i-2]] );		if(tc_SHIFT1[hash] >= m-1-i) tc_SHIFT1[hash] = m-1-i;	}	i=m-1;		hash = (tc_tr1[Pattern[i]]);		hash = (hash << Hbits) + (tc_tr1[Pattern[i-1]]);	if(tc_LONG) hash = (hash << Hbits) + (tc_tr1[Pattern[i-2]] );		if(tc_SHORT) hash=tc_tr[Pattern[0]];#ifdef debug	printf("hash = %d\n", hash);#endif		tc_HASH[hash]++;		return;}#endif	/*DOTCOMPRESSED*/static 
voidf_prep1(pat_index, Pattern)uchar *Pattern;   int pat_index;{int i, m;int hash2;register unsigned hash;	m = p_size;#ifdef debug	puts(Pattern);#endif		for (i=m-1; i>=(1+LONG); i--) {				hash = (tr1[Pattern[i]]);				hash = (hash << Hbits) + (tr1[Pattern[i-1]]);		if(LONG) hash = (hash << Hbits) + (tr1[Pattern[i-2]] );		if(SHIFT1[hash] >= m-1-i) SHIFT1[hash] = m-1-i;	}	i=m-1;		hash = (tr1[Pattern[i]]);		hash = (hash << Hbits) + (tr1[Pattern[i-1]]);	if(LONG) hash = (hash << Hbits) + (tr1[Pattern[i-2]] );		if(SHORT) hash=tr[Pattern[0]];	hash2 = (tr[Pattern[0]] << 8) + tr[Pattern[1]];#ifdef debug	printf("hash = %d, HASH[hash] = %d\n", hash, HASH[hash]);#endif		PatPtr[HASH[hash]] = Pattern;		pat_indices[HASH[hash]] = pat_index;	Hash2[HASH[hash]] = hash2;		HASH[hash]--;		return;}#if	DOTCOMPRESSEDstatic voidtc_f_prep1(pat_index, Pattern)uchar *Pattern;   int pat_index;{int i, m;int hash2;register unsigned hash;	m = tc_p_size;#ifdef debug	puts(Pattern);#endif		for (i=m-1; i>=(1+tc_LONG); i--) {				hash = (tc_tr1[Pattern[i]]);				hash = (hash << Hbits) + (tc_tr1[Pattern[i-1]]);		if(tc_LONG) hash = (hash << Hbits) + (tc_tr1[Pattern[i-2]] );		if(tc_SHIFT1[hash] >= m-1-i) tc_SHIFT1[hash] = m-1-i;	}	i=m-1;		hash = (tc_tr1[Pattern[i]]);		hash = (hash << Hbits) + (tc_tr1[Pattern[i-1]]);	if(tc_LONG) hash = (hash << Hbits) + (tc_tr1[Pattern[i-2]] );		if(tc_SHORT) hash=tc_tr[Pattern[0]];	hash2 = (tc_tr[Pattern[0]] << 8) + tc_tr[Pattern[1]];#ifdef debug	printf("hash = %d, tc_HASH[hash] = %d\n", hash, tc_HASH[hash]);#endif		tc_PatPtr[tc_HASH[hash]] = Pattern;		tc_pat_indices[tc_HASH[hash]] = pat_index;	tc_Hash2[tc_HASH[hash]] = hash2;		tc_HASH[hash]--;		return;}#endif	/*DOTCOMPRESSED*/static voidaccumulate(){	int i;	for(i=1; i<MAXHASH; i++)  {	/*	printf("%d, ", HASH[i]);	*/	HASH[i] = HASH[i-1] + HASH[i];	}	HASH[0] = 0;	return;}#if	DOTCOMPRESSEDstatic voidtc_accumul

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -