📄 sgrep.c
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */#include <stdio.h>#include <ctype.h>#include "agrep.h"#include <errno.h>#undef MAXSYM#define MAXSYM 256#define MAXMEMBER 8192#define CHARTYPE unsigned char#undef MaxError /* don't use agrep.h definition */#define MaxError 20#define MAXPATT 256#undef MAXLINE#define MAXLINE 1024#undef MAXNAME#define MAXNAME 256#undef MaxCan /* don't use agrep.h definition */#define MaxCan 2048#define BLOCKSIZE 16384#define MAX_SHIFT_2 4096#undef ON#define ON 1#undef OFF#define OFF 0#define LOG_ASCII 8#define LOG_DNA 3#define MAXMEMBER_1 65536#define LONG_EXAC 20#define LONG_APPX 24#if ISO_CHAR_SET#define W_DELIM 256#else#define W_DELIM 128#endif#include <sys/time.h>extern int tuncompressible();extern int quick_tcompress();extern int quick_tuncompress();extern int DELIMITER, OUTTAIL;extern int D_length, tc_D_length;extern unsigned char D_pattern[MaxDelimit *2], tc_D_pattern[MaxDelimit *2];extern int LIMITOUTPUT, LIMITPERFILE, INVERSE;extern int CurrentByteOffset;extern int BYTECOUNT;extern int PRINTOFFSET;extern int PRINTRECORD;extern int CONSTANT, COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched, PRINTFILETIME;extern int DNA ; /* DNA flag is set in checksg when pattern is DNA pattern and p_size > 16 */extern WORDBOUND, WHOLELINE, NOUPPER;extern unsigned char CurrentFileName[], Progname[]; extern long CurrentFileTime;extern unsigned Mask[];extern unsigned endposition;extern int agrep_inlen;extern CHARTYPE *agrep_inbuffer;extern int agrep_initialfd;extern FILE *agrep_finalfp;extern int agrep_outpointer;extern int agrep_outlen;extern CHARTYPE * agrep_outbuffer;extern int NEW_FILE, POST_FILTER;extern int EXITONERROR;extern int errno;extern int TCOMPRESSED;extern int EASYSEARCH;extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];#if MEASURE_TIMES/* timing variables */extern int OUTFILTER_ms;extern int FILTERALGO_ms;extern int INFILTER_ms;#endif 
/*MEASURE_TIMES*/unsigned char BSize; /* log_c m */unsigned char char_map[MAXSYM];/* data area */int shift_1;CHARTYPE SHIFT[MAXSYM];CHARTYPE MEMBER[MAXMEMBER];CHARTYPE pat[MAXPATT];unsigned Hashmask;char MEMBER_1[MAXMEMBER_1];CHARTYPE TR[MAXSYM];static void initmask();static void am_preprocess();static void m_preprocess();static void prep();static void prep4();static void prep_bm();/*
 * General idea behind output processing with delimiters, inverse, compression, etc.
 *
 * CAUTION: In compressed files, we can search ONLY for simple patterns or their
 * ';' / ',' combinations (NOTE(review): presumably the AND / OR operators of the
 * pattern syntax -- confirm). Attempts to search for complex patterns or patterns
 * with errors might lead to spurious matches.
 *
 * 1. Once we find the match, go backward and forward to get the delimiters that
 *    surround the matched region.
 * 2. If it is a compressed file, verify that the match is "real" (compressed files
 *    can have pseudo matches, hence this filtering step is required).
 * 3. Increment num_of_matched.
 * 4. Process the output options which print something before the matched region
 *    is printed.
 * 5. If there is compression, decompress and output the matched region. Otherwise
 *    just output it as is. Remember, from step (1) we know the matched region.
 * 6. If inverse is set, then we must keep track of the end of the last matched
 *    region in the variable lastout. When there is a match, we must print
 *    everything from lastout to the beginning of the current matched region
 *    (curtextbegin) and then update lastout to point to the end of the current
 *    matched region (curtextend).
 *    ALSO: if we exit from the main loops, we must output everything from the end
 *    of the last matched region to the end of the input buffer.
 * 7. Delimiter handling in complex patterns is different: there the search is done
 *    for a boolean AND of the delimiter pattern and the actual pattern.
*//* skips over escaped characters */unsigned char *mystrchr(s, c)unsigned char *s;int c;{ unsigned char *t = s; while (*t) { if (*t == '\\') t++; else if (c == *t) return t; t ++; } return NULL;}voidchar_tr(pat, m)unsigned char *pat;int *m;{ int i; unsigned char temp[MAXPATT]; for(i=0; i<MAXSYM; i++) TR[i] = i; if(NOUPPER) { for(i=0; i<MAXSYM; i++) if (isupper(i)) TR[i] = TR[tolower(i)]; /* for(i='A'; i<= 'Z'; i++) TR[i] = i + 'a' - 'A'; */ } /* if(WORDBOUND) { for(i=0; i<MAXSYM; i++) { if(!isalnum(i)) TR[i] = W_DELIM;removed by Udi. we don't use the trick of making the boundary W_delim anymore. It's too buggy otherwise and it's not necessary. } } removed by bg 11/8/94 */ if(WHOLELINE) { memcpy(temp, pat, *m); pat[0] = '\n'; memcpy(pat+1, temp, *m); pat[*m+1] = '\n'; pat[*m+2] = 0; *m = *m + 2; }}intsgrep(in_pat, in_m, fd, D, samepattern)CHARTYPE *in_pat; int fd, in_m, D;{ CHARTYPE patbuf[MAXLINE]; CHARTYPE *pat = patbuf; int m = in_m; CHARTYPE *text; /* input text stream */ int offset = 2*Max_record; int buf_end, num_read, i, start, end, residue = 0; int first_time = 1; CHARTYPE *oldpat = pat; int k, j, oldm = m; static CHARTYPE newpat[MAXLINE]; /* holds compressed version */ static int newm;#if MEASURE_TIMES static struct timeval initt, finalt;#endif CHARTYPE *tempbuf; int oldCurrentByteOffset; strncpy(pat, in_pat, MAXLINE); pat[MAXLINE-1] = '\0';#define PROCESS_PATTERN \ if (!CONSTANT) {\ if( (pat[0] == '^') || (pat[0] == '$') ) pat[0] = '\n';\ if ((m>1) && (pat[m-2] != '\\') && ((pat[m-1] == '^') || (pat[m-1] == '$'))) pat[m-1] = '\n';\ }\ /* whether constant or not, interpret the escape character */\ for (k=0; k<m; k++) {\ if (pat[k] == '\\') {\ for (j=k; j<m; j++)\ pat[j] = pat[j+1]; /* including '\0' */\ m--;\ }\ }\ char_tr(pat, &m); /* will change pat, and m if WHOLELINE is ON */\ if(m >= MAXPATT) {\ fprintf(stderr, "%s: pattern too long (has > %d chars)\n", Progname, MAXPATT);\ if (!EXITONERROR) {\ errno = AGREP_ERROR;\ return -1;\ }\ else exit(2);\ }\ 
if(D == 0) {\ if(m > LONG_EXAC) m_preprocess(pat);\ else prep_bm(pat, m);\ }\ else if (DNA) prep4(pat, m);\ else if(m >= LONG_APPX) am_preprocess(pat);\ else {\ prep(pat, m, D);\ initmask(pat, Mask, m, 0, &endposition);\ }#if AGREP_POINTER if (fd != -1) {#endif /*AGREP_POINTER*/ alloc_buf(fd, &text, 2*BlockSize+2*Max_record+MAXPATT); text[offset-1] = '\n'; /* initial case */ for(i=0; i < Max_record; i++) text[i] = 0; /* security zone */ start = offset; if(WHOLELINE) { start--; CurrentByteOffset --; } while( (num_read = fill_buf(fd, text+offset, 2*BlockSize)) > 0) { buf_end = end = offset + num_read -1 ; oldCurrentByteOffset = CurrentByteOffset; if (first_time) { if ((TCOMPRESSED == ON) && tuncompressible(text+offset, num_read)) { EASYSEARCH = text[offset+SIGNATURE_LEN-1]; start += SIGNATURE_LEN; CurrentByteOffset += SIGNATURE_LEN; if (!EASYSEARCH) { fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName); }#if MEASURE_TIMES gettimeofday(&initt, NULL);#endif /*MEASURE_TIMES*/ if (samepattern || ((newm = quick_tcompress(FREQ_FILE, HASH_FILE, pat, m, newpat, Max_record-8, EASYSEARCH)) > 0)) { oldm = m; oldpat = pat; m = newm; pat = newpat; }#if MEASURE_TIMES gettimeofday(&finalt, NULL); INFILTER_ms += (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);#endif /*MEASURE_TIMES*/ } else TCOMPRESSED = OFF; PROCESS_PATTERN /* must be AFTER we know that it is a compressed pattern... 
*/ for(i=1; i<=m; i++) text[2*BlockSize+offset+i] = pat[m-1]; /* to make sure the skip loop in bm() won't go out of bound in later iterations */ first_time = 0; } if (!DELIMITER) { while ((text[end] != '\n') && (end > offset)) end--; text[start-1] = '\n'; } else { unsigned char *newbuf = text + end + 1; newbuf = backward_delimiter(newbuf, text+offset, D_pattern, D_length, OUTTAIL); /* see agrep.c/'d' */ if (newbuf < text+offset+D_length) newbuf = text + end + 1; end = newbuf - text - 1; memcpy(text+start-D_length, D_pattern, D_length); } residue = buf_end - end + 1 ; /* SGREP_PROCESS */ /* No harm in sending a few extra parameters even if they are unused: they are not accessed in monkey*()s */ if(D==0) { if(m > LONG_EXAC) { if (-1 == monkey(pat, m, text+start, text+end, oldpat, oldm)) { free_buf(fd, text); return -1; } } else { if (-1 == bm(pat, m, text+start, text+end, oldpat, oldm)) { free_buf(fd, text); return -1; } } } else { if(DNA) { if (-1 == monkey4( pat, m, text+start, text+end, D , oldpat, oldm )) { free_buf(fd, text); return -1; } } else { if(m >= LONG_APPX) { if (-1 == a_monkey(pat, m, text+start, text+end, D, oldpat, oldm)) { free_buf(fd, text); return -1; } } else { if (-1 == agrep(pat, m, text+start, text+end, D, oldpat, oldm)) { free_buf(fd, text); return -1; } } } } if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) { if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%s", CurrentFileName); else { int outindex; for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) && (CurrentFileName[outindex] != '\0'); outindex++) { agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex]; } if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) { OUTPUT_OVERFLOW; free_buf(fd, text); return -1; } agrep_outpointer += outindex; } if (PRINTFILETIME) { char *s = aprint_file_time(CurrentFileTime); if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "%s", s); else { int outindex; 
for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) && (s[outindex] != '\0'); outindex++) { agrep_outbuffer[agrep_outpointer+outindex] = s[outindex]; } if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) { OUTPUT_OVERFLOW; free_buf(fd, text); return -1; } agrep_outpointer += outindex; } } if (agrep_finalfp != NULL) fprintf(agrep_finalfp, "\n"); else { if (agrep_outpointer+1>=agrep_outlen) { OUTPUT_OVERFLOW; free_buf(fd, text); return -1; } else agrep_outbuffer[agrep_outpointer++] = '\n'; } free_buf(fd, text); NEW_FILE = OFF; return 0; } CurrentByteOffset = oldCurrentByteOffset + end - start + 1; /* for a new iteration: avoid complicated calculations below */ start = offset - residue ; if(start < Max_record) { start = Max_record; } /* strncpy(text+start, text+end, residue); */ memcpy(text+start, text+end, residue); start++; if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) || ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) { free_buf(fd, text); return 0; /* done */ } } /* end of while(num_read = ...) */ if (!DELIMITER) { text[start-1] = '\n'; text[start+residue] = '\n'; } else { if (start > D_length) memcpy(text+start-D_length, D_pattern, D_length); memcpy(text+start+residue, D_pattern, D_length); } end = start + residue - 2; if(residue > 1) { /* SGREP_PROCESS */ /* No harm in sending a few extra parameters even if they are unused: they are not accessed in monkey*()s */ if(D==0) { if(m > LONG_EXAC) { if (-1 == monkey(pat, m, text+start, text+end, oldpat, oldm)) { free_buf(fd, text); return -1; } } else { if (-1 == bm(pat, m, text+start, text+end, oldpat, oldm)) { free_buf(fd, text); return -1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -