/* sgrep.c */
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
/* #define DEBUG2 */
/* #define DEBUG */
/*
[chg] 05.10.96 sgrep():
- when the last buffer is read:
no need to look back to the previous CR,
just process until (end) = (buf_end)
[chg] bm():
- when there is a hit at the last line of a file,
and the file does not have a CR as the last character,
the print buffer is now terminated with an artificial
CR.
[chg] 04.10.96 bm(): major bugs in the algorithm removed [TG]
[chg] 22.09.96 UL850[].lower_1 -> UL850.lower [TG]
[chg] 21.08.96 header file ISO_CHAR [TG]
[chg] 13.08.96 edited by Tom Gries <gries@ibm.net>
*/
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include "agrep.h"
#include "codepage.h"
/* Byte lookup table defined elsewhere; char_tr() uses it to fold characters
   on EMX/Win32 builds when ISO_CHAR_SET is defined. */
extern unsigned char LUT[256];
/*
 * Local limits and switches. Several names are #undef'd first so that the
 * values given here replace any earlier definition from the included headers.
 */
/* Number of distinct byte values; size of char_map[], SHIFT[] and TR[]. */
#undef MAXSYM
#define MAXSYM 256
/* Size of the MEMBER[] table. */
#define MAXMEMBER 8192
/* Byte type used for text and patterns throughout this file. */
#define CHARTYPE unsigned char
#undef MaxError /* don't use agrep.h definition */
#define MaxError 20
/* Size of the file-scope pat[] buffer; sgrep() rejects patterns whose
   processed length is >= MAXPATT. */
#define MAXPATT 256
/* Size of sgrep()'s local pattern buffers and of the zeroed zone at the start
   of its text buffer; the text itself starts at offset 2*MAXLINE. */
#undef MAXLINE
#define MAXLINE 1024
#undef MAXNAME
#define MAXNAME 256
#undef MaxCan /* don't use agrep.h definition */
#define MaxCan 2048
/* sgrep() asks fill_buf() for up to 2*BLOCKSIZE bytes per read. */
#define BLOCKSIZE 16384
#define MAX_SHIFT_2 4096
/* Flag values, e.g. compared against / assigned to TCOMPRESSED in sgrep(). */
#undef ON
#define ON 1
#undef OFF
#define OFF 0
#define LOG_ASCII 8
#define LOG_DNA 3
/* Size of the MEMBER_1[] table. */
#define MAXMEMBER_1 65536
/* Exact search (D == 0): patterns longer than LONG_EXAC go through
   m_preprocess()/monkey(), shorter ones through prep_bm()/bm(). */
#define LONG_EXAC 20
/* Approximate, non-DNA search: patterns of at least LONG_APPX characters
   are prepared with am_preprocess(), shorter ones with prep()/initmask(). */
#define LONG_APPX 24
/* Word-delimiter code; in the visible part of this file it is referenced only
   from a commented-out block in char_tr(). */
#if ISO_CHAR_SET
#define W_DELIM 256
#else
#define W_DELIM 128
#endif
#ifndef _WIN32
#include <sys/time.h>
#else
#include <sys/timeb.h>
#endif
/* Compression helpers defined elsewhere. sgrep() calls tuncompressible() on
   the first block read when TCOMPRESSED is ON, and quick_tcompress() to
   produce a compressed form of the pattern. */
extern int tuncompressible();
extern int quick_tcompress();
extern int quick_tuncompress();
/* Record-delimiter settings: when DELIMITER is set, sgrep() locates the end of
   the last complete record with backward_delimiter(), passing D_pattern,
   D_length and OUTTAIL, and copies D_pattern in front of the search start. */
extern int DELIMITER, OUTTAIL;
extern int D_length, tc_D_length;
extern unsigned char D_pattern[MaxDelimit *2], tc_D_pattern[MaxDelimit *2];
extern int LIMITOUTPUT, LIMITPERFILE, INVERSE;
/* Running byte offset; sgrep() adjusts it for WHOLELINE and for the
   compression signature and saves it at the start of every block. */
extern int CurrentByteOffset;
extern int BYTECOUNT;
extern int PRINTOFFSET;
extern int PRINTRECORD;
/* CONSTANT: when set, PROCESS_PATTERN leaves '^' and '$' in the pattern
   instead of replacing them with newlines at the pattern's ends. */
extern int CONSTANT, COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched;
extern int DNA ; /* DNA flag is set in checksg when pattern is DNA pattern
and p_size > 16 */
/* Option flags defined elsewhere. The type is spelled out because implicit
   int was removed from the language in C99; the declared type is unchanged. */
extern int WORDBOUND, WHOLELINE, NOUPPER;
/* Names used in diagnostics: sgrep() prints Progname in the "pattern too long"
   message and CurrentFileName in the easy-search warning. */
extern unsigned char CurrentFileName[], Progname[];
/* Passed to initmask() by PROCESS_PATTERN for short approximate patterns. */
extern unsigned Mask[];
extern unsigned endposition;
extern int agrep_inlen;
extern CHARTYPE *agrep_inbuffer;
extern int agrep_initialfd;
extern FILE *agrep_finalfp;
extern int agrep_outpointer;
extern int agrep_outlen;
extern CHARTYPE * agrep_outbuffer;
extern int NEW_FILE, POST_FILTER;
/* When set, the "pattern too long" error in PROCESS_PATTERN calls exit(2);
   otherwise errno is set to AGREP_ERROR and sgrep() returns -1. */
extern int EXITONERROR;
/* NOTE(review): many current C libraries provide errno only through
   <errno.h>, often as a macro; a plain extern declaration may not link
   there -- confirm on the target platform. */
#ifndef __BORLANDC__
extern int errno;
#endif
/* TCOMPRESSED: sgrep() turns it OFF when the first block is not recognised by
   tuncompressible(). EASYSEARCH: taken from the last signature byte of a
   compressed input. */
extern int TCOMPRESSED;
extern int EASYSEARCH;
/* FREQ_FILE and HASH_FILE are handed to quick_tcompress() by sgrep(). */
extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];
/* Old-style declarations that are only emitted for Win32 builds; the trailing
   comment names the file that holds each definition.
   NOTE(review): other builds appear to rely on declarations from the headers
   or on implicit declarations -- confirm. */
#ifdef _WIN32
int fill_buf(); /* bitap.c */
int a_monkey(); /* sgrep.c */
int agrep(); /* sgrep.c */
int bm(); /* sgrep.c */
int blog(); /* sgrep.c */
int monkey(); /* sgrep.c */
int monkey4(); /* sgrep.c */
int s_output(); /* sgrep.c */
int verify(); /* sgrep.c */
#endif
#if MEASURE_TIMES
/* timing variables (milliseconds); sgrep() adds the time spent in
   quick_tcompress() to INFILTER_ms */
extern int OUTFILTER_ms;
extern int FILTERALGO_ms;
extern int INFILTER_ms;
#endif /*MEASURE_TIMES*/
unsigned char BSize; /* log_c m */
unsigned char char_map[MAXSYM];
/* data area */
int shift_1;
CHARTYPE SHIFT[MAXSYM];
CHARTYPE MEMBER[MAXMEMBER];
/* File-scope pattern buffer. Note that sgrep() declares a local variable and
   char_tr() a parameter with the same name, which hide this one there. */
CHARTYPE pat[MAXPATT];
unsigned Hashmask;
char MEMBER_1[MAXMEMBER_1];
/* Byte translation table filled by char_tr(): identity, then case folding. */
CHARTYPE TR[MAXSYM];
/* Pattern preprocessing routines. PROCESS_PATTERN in sgrep() picks one based
   on the error count D, the DNA flag and the pattern length m:
   D == 0: m_preprocess() if m > LONG_EXAC, otherwise prep_bm();
   D != 0: prep4() for DNA, am_preprocess() if m >= LONG_APPX, otherwise
           prep() followed by initmask(). */
static void initmask();
static void am_preprocess();
static void m_preprocess();
static void prep();
static void prep4();
static void prep_bm();
/*
* General idea behind output processing with delimiters, inverse, compression, etc.
* CAUTION: In compressed files, we can search ONLY for simple patterns or their ;,.
* Attempts to search for complex patterns / with errors might lead to spurious matches.
* 1. Once we find the match, go back and forward to get the delimiters that surround
* the matched region.
* 2. If it is a compressed file, verify that the match is "real" (compressed files
* can have pseudo matches hence this filtering step is required).
* 3. Increment num_of_matched.
* 4. Process some output options which print stuff before the matched region is
* printed.
* 5. If there is compression, decompress and output the matched region. Otherwise
* just output it as is. Remember, from step (1) we know the matched region.
* 6. If inverse is set, then we must keep track of the end of the last matched region
* in the variable lastout. When there is a match, we must print everything from
* lastout to the beginning of the current matched region (curtextbegin) and then
* update lastout to point to the end of the current matched region (curtextend).
* ALSO: if we exit from the main loops, we must output everything from the end
* of the last matched region to the end of the input buffer.
* 7. Delimiter handling in complex patterns is different: there the search is done
* for a boolean and of the delimiter pattern and the actual pattern.
*/
/*
 * mystrchr: find the first unescaped occurrence of character c in the
 * NUL-terminated string s.
 *
 * A backslash escapes the character that follows it: both are skipped and
 * neither can match, so searching for '\\' itself never succeeds.
 * Returns a pointer to the match, or NULL when c does not occur unescaped.
 */
unsigned char *
mystrchr(s, c)
unsigned char *s;
int c;
{
	unsigned char *t = s;

	while (*t) {
		if (*t == '\\') {
			t++;	/* step onto the escaped character */
			/* A trailing backslash escapes nothing: stop at the
			   terminator instead of walking past the end of s. */
			if (*t == '\0') break;
		}
		else if (c == *t) return t;
		t++;
	}
	return NULL;
}
/*
 * char_tr: initialise the global translation table TR and, when WHOLELINE is
 * set, wrap the pattern in newline characters.
 *
 * pat  pattern buffer, modified in place when WHOLELINE is set; the caller
 *      must provide room for *m + 3 bytes in that case.
 * m    in/out pattern length; increased by 2 when WHOLELINE is set.
 *
 * TR starts as the identity mapping and is then case-folded unconditionally
 * (the NOUPPER test was disabled by [TG]): through LUT on EMX/Win32 builds
 * with ISO_CHAR_SET defined, otherwise through isupper()/tolower().
 *
 * The former WORDBOUND trick of mapping every non-alphanumeric character to
 * W_DELIM is no longer applied (removed by Udi / bg, 11/8/94, as too buggy
 * and unnecessary).
 */
void
char_tr(pat, m)
unsigned char *pat;
int *m;
{
	int i;

	for (i = 0; i < MAXSYM; i++) TR[i] = i;
	/* if(NOUPPER) [TG] */ {
		for (i = 0; i < MAXSYM; i++)
#if ((defined(__EMX__) || defined(_WIN32)) && defined(ISO_CHAR_SET))
			TR[i] = TR[LUT[i]];
#else
			if (isupper(i)) TR[i] = TR[tolower(i)];
#endif
	}
	if (WHOLELINE) {
		/* Shift the pattern up by one byte in place. The earlier version
		   staged it in a MAXPATT-sized stack buffer, which overflowed for
		   longer patterns because sgrep() checks the length only after
		   this call. */
		memmove(pat + 1, pat, *m);
		pat[0] = '\n';
		pat[*m + 1] = '\n';
		pat[*m + 2] = 0;
		*m = *m + 2;
	}
}
int sgrep(in_pat, in_m, fd, D, samepattern)
CHARTYPE *in_pat;
int fd, in_m, D;
{
CHARTYPE patbuf[MAXLINE];
CHARTYPE *pat = patbuf;
int m = in_m;
CHARTYPE *text; /* input text stream */
int offset = 2*MAXLINE;
int buf_end, num_read, i, start, end, residue = 0;
int first_time = 1;
CHARTYPE *oldpat = pat;
int k, j, oldm = m;
static CHARTYPE newpat[MAXLINE]; /* holds compressed version */
static int newm;
#if MEASURE_TIMES
static struct timeval initt, finalt;
#endif
CHARTYPE *tempbuf;
int oldCurrentByteOffset;
strncpy(pat, in_pat, MAXLINE);
pat[MAXLINE-1] = '\0';
#define PROCESS_PATTERN \
if (!CONSTANT) {\
if( (pat[0] == '^') || (pat[0] == '$') ) pat[0] = '\n';\
if ((m>1) && (pat[m-2] != '\\') && ((pat[m-1] == '^') || (pat[m-1] == '$'))) pat[m-1] = '\n';\
}\
/* whether constant or not, interpret the escape character */\
for (k=0; k<m; k++) {\
if (pat[k] == '\\') {\
for (j=k; j<m; j++)\
pat[j] = pat[j+1]; /* including '\0' */\
m--;\
}\
}\
char_tr(pat, &m); /* will change pat, and m if WHOLELINE is ON */\
if(m >= MAXPATT) {\
fprintf(stderr, "%s: pattern too long (has > %d chars)\n", Progname, MAXPATT);\
if (!EXITONERROR) {\
errno = AGREP_ERROR;\
return -1;\
}\
else exit(2);\
}\
if(D == 0) {\
if(m > LONG_EXAC) m_preprocess(pat);\
else prep_bm(pat, m);\
}\
else if (DNA) prep4(pat, m);\
else if(m >= LONG_APPX) am_preprocess(pat);\
else {\
prep(pat, m, D);\
initmask(pat, Mask, m, 0, &endposition);\
}
#if AGREP_POINTER
if (fd != -1) {
#endif /*AGREP_POINTER*/
alloc_buf(fd, &text, 2*BLOCKSIZE+2*MAXLINE+MAXPATT);
text[offset-1] = '\n'; /* initial case */
for(i=0; i < MAXLINE; i++) text[i] = 0; /* security zone */
start = offset;
if(WHOLELINE) {
start--;
CurrentByteOffset --;
}
while( (num_read = fill_buf(fd, text+offset, 2*BLOCKSIZE)) > 0)
{
buf_end = end = offset + num_read -1 ;
oldCurrentByteOffset = CurrentByteOffset;
if (first_time) {
if ((TCOMPRESSED == ON) && tuncompressible(text+offset, num_read)) {
EASYSEARCH = text[offset+SIGNATURE_LEN-1];
start += SIGNATURE_LEN;
CurrentByteOffset += SIGNATURE_LEN;
if (!EASYSEARCH) {
fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName);
}
#if MEASURE_TIMES
gettimeofday(&initt, NULL);
#endif /*MEASURE_TIMES*/
if (samepattern || ((newm = quick_tcompress(FREQ_FILE, HASH_FILE, pat, m, newpat, MAXLINE-8, EASYSEARCH)) > 0)) {
oldm = m;
oldpat = pat;
m = newm;
pat = newpat;
}
#if MEASURE_TIMES
gettimeofday(&finalt, NULL);
INFILTER_ms += (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);
#endif /*MEASURE_TIMES*/
}
else TCOMPRESSED = OFF;
PROCESS_PATTERN /* must be AFTER we know that it is a compressed pattern... */
/* to make sure the skip loop in bm() won't go out of bound in later iterations:
This was the original code.
It will be inefficient to place a copy of the pattern at the end
of the buffer. Better put that stop pattern to the end of the
_actual_ read block (end) [TG] 04.10.96 */
/* for(i=1; i<=m; i++) text[2*BLOCKSIZE+offset+i] = pat[m-1]; [del] [TG] */
/* Emergency Stop: put one copy of pattern to the end of the buffer
to make sure that the skip loop in bm()
won't go out of bound in later iterations */
/* save portion being overwritten.
copied from below (memagrep()), but not need here: */
/* memcpy(tempbuf, text+end+1, m); */
for(i=1; i<=m; i++) text[end+i] = pat[m-1]; /* [new] [TG] */
first_time = 0;
}
if (!DELIMITER) {
/* [TG] */ if (num_read == 2*BLOCKSIZE) {
while ((text[end] != '\n') && (end > offset)) end--;
}
/* else end = buf_end; no need to look back [TG] */
text[start-1] = '\n';
}
else {
unsigned char *newbuf = text + end + 1;
newbuf = backward_delimiter(newbuf, text+offset, D_pattern, D_length, OUTTAIL); /* see agrep.c/'d' */
if (newbuf < text+offset+D_length) newbuf = text + end + 1;
end = newbuf - text - 1;
memcpy(text+start-D_length, D_pattern, D_length);
}
residue = buf_end - end + 1 ;
/* SGREP_PROCESS */
/* No harm in sending a few extra parameters even if they are unused: they are not accessed in monkey*()s */
if(D==0) {
if(m > LONG_EXAC) {
if (-1 == monkey(pat, m, text+start, text+end, oldpat, oldm)) {
free_buf(fd, text);
return -1;
}
}
else {
if (-1 == bm(pat, m, text+start, text+end, oldpat, oldm)) {
free_buf(fd, text);
return -1;
}
}
}
else {
if(DNA) {
if (-1 == monkey4( pat, m, text+start, text+end, D , oldpat, oldm )) {
free_buf(fd, text);
return -1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -