📄 main.c
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. *//* bgopal: (1993-4) redesigned/rewritten using agrep's library interface */#include <sys/param.h>#include <errno.h>#include "glimpse.h"#include "defs.h"#include <fcntl.h>#include "checkfile.h"#include <sys/types.h>#include <sys/stat.h>#include <sys/time.h>#include <sys/file.h> /* for flock definition */#if ISO_CHAR_SET#include <locale.h> /* support for 8bit character set */#endif#define CLIENTSERVER 1#define USE_MSGHDR 0#define USE_UNIXDOMAIN 0#define DEBUG 0#define DEF_SERV_PORT 2001#define MIN_SERV_PORT 1024#define MAX_SERV_PORT 30000#define SERVER_QUEUE_SIZE 10 /* number of requests to buffer up while processing one request = 5 *//* Borrowed from C-Lib */extern char **environ;extern int errno;#if CLIENTSERVER#include "communicate.c"#endif /*CLIENTSERVER*//* For client-server protocol */CHAR *SERV_HOST = NULL;int SERV_PORT;char glimpse_reqbuf[MAX_ARGS*MAX_NAME_LEN];extern int glimpse_clientdied; /* set if signal received about dead socket: need agrep variable so that exec() can return quickly */int glimpse_reinitialize = 0;/* Borrowed from agrep.c */extern int D_length; /* global variable in agrep */extern int D; /* global variable in agrep */extern int pattern_index;/* These are used for byte level index search */extern CHAR CurrentFileName[MAX_LINE_LEN];extern int SetCurrentFileName;extern int CurrentByteOffset;extern int SetCurrentByteOffset;extern long CurrentFileTime;extern int SetCurrentFileTime;extern int execfd;extern int agrep_initialfd;extern CHAR *agrep_inbuffer;extern int agrep_inlen;extern int agrep_inpointer;extern FILE *agrep_finalfp;extern CHAR *agrep_outbuffer;extern int agrep_outlen;extern int agrep_outpointer;extern int glimpse_call; /* prevent agrep from printing out its usage */extern int glimpse_isserver; /* prevent agrep from asking for user input */int first_search = 1; /* intra/interaction in process_query() and glimpse_search() */#if ISSERVERint RemoteFiles = 0; /* Are the files present locally or remotely? If on, then -NQ is automatically added to all search options for each query */#endif/* Borrowed from index/io.c */extern int InfoAfterFilename;extern int OneFilePerBlock;extern int StructuredIndex;extern unsigned int *dest_index_set;extern unsigned char *dest_index_buf;extern unsigned int *src_index_set;extern unsigned char *src_index_buf;extern unsigned char *merge_index_buf;extern int mask_int[32];extern int indexable_char[256];int test_indexable_char[256];extern int p_table[MAX_PARTITION];extern int GMAX_WORD_SIZE;extern int IndexNumber; /* used in getword() */extern int InterpretSpecial; /* used to "not-split" agrep-regexps */extern int UseFilters; /* defined in build_in.c, used for filtering routines in io.c */extern int ByteLevelIndex;extern int RecordLevelIndex;extern int rdelim_len;extern char rdelim[MAX_LINE_LEN];extern char old_rdelim[MAX_LINE_LEN];extern int file_num;extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE;/* Borrowed from get_filename.c */extern int bigbuffer_size;extern char *bigbuffer;extern char *outputbuffer;/* OPTIONS/FLAGS */int veryfast = 0;int CONTACT_SERVER = 0; /* Should client try to call server at all or just process query on its own? */int NOBYTELEVEL = 0; /* Some cases where we cannot do byte level fast-search: ALWAYS 0 if !ByteLevelIndex */int OPTIMIZEBYTELEVEL = 0; /* Some cases where we don't want to do byte level search since number of files is small */int GCONSTANT = 0; /* should pattern be taken as-is or parsed? */int GLIMITOUTPUT = 0; /* max no. of output lines: 0=>infinity=default=nolimit */int GLIMITTOTALFILE = 0; /* max no. of files to match: 0=>infinity=default=nolimit */int GLIMITPERFILE = 0; /* not used in glimpse */int GBESTMATCH = 0; /* Should I change -B to -# where # = no. of errors? */int GRECURSIVE = 0;int GNOPROMPT = 0;int GBYTECOUNT = 0;int GPRINTFILENUMBER = 0;int GPRINTFILETIME = 0;int GOUTTAIL = 0;int GFILENAMEONLY = 0; /* how to do it if it is an and expression in structured queries */int GNOFILENAME=0;int GPRINTNONEXISTENTFILE = 0; /* if filename is not there in index, then at least let user know its name */int MATCHFILE = 0;int PRINTATTR = 0;int PRINTINDEXLINE = 0;int Pat_as_is=0;int Only_first=0; /* Do index search only */int PRINTAPPXFILEMATCH=0; /* Print places in file where match occurs: useful with -b only to analyse the index */int GCOUNT=0; /* print number of matches rather than actual matches: used only when PRINTAPPX = 1 */int HINTSFROMUSER=0; /* The user gives the hints about where we should search (result of adding -EQNgy) */int WHOLEFILESCOPE=0; /* used only when foundattr is NOT set: otherwise, scope is whole file anyway */int foundattr=0; /* set in split.c -- != 0 only when StructuredIndex AND query is structured */int foundnot=0; /* set in split.c -- != 0 only when the not operator (~) is present in the pattern */int FILENAMESINFILE=0; /* whether the user is providing an explicit list of filenames to be searched for pattern (if absent, then means all files) */int BITFIELDFILE=0; /* Based on contribution From ada@mail2.umu.se Fri Jul 12 01:56 MST 1996; Christer Holgersson, Sen. SysNet Mgr, Umea University/SUNET, Sweden */int BITFIELDOFFSET=0;int BITFIELDLENGTH=0;int BITFIELDENDIAN=0;int GNumDays = 0; /* whether the user wants files modified within these many days before creating the index: only >0 makes sense *//* structured queries */CHAR ***attr_vals; /* matrix of char pointers: row=max #of attributes, col=max possible values */CHAR **attr_found; /* did the expression corr. to each value in attr_vals match? */ParseTree *GParse; /* what kind of expression corr. to attr are we looking for *//* arbitrary booleans */ParseTree terminals[MAXNUM_PAT]; /* parse tree's terminal node pointers pt. to elements of this array; also used outside */ char matched_terminals[MAXNUM_PAT]; /* ...[i] is 1 if i'th terminal matched: used in filter_output and eval_tree */int num_terminals; /* number of terminal patterns */int ComplexBoolean=0; /* 1 if we need to use parse trees and the eval function *//* index search */CHAR *pat_list[MAXNUM_PAT]; /* complete words within global pattern */int pat_lens[MAXNUM_PAT]; /* their lengths */int pat_attr[MAXNUM_PAT]; /* set of attributes */int is_mgrep_pat[MAXNUM_PAT];int mgrep_pat_index[MAXNUM_PAT];int num_mgrep_pat;CHAR pat_buf[(MAXNUM_PAT + 2)*MAXPAT];int pat_ptr = 0;extern char INDEX_DIR[MAX_LINE_LEN];char *TEMP_DIR = NULL; /* directory to store glimpse temporary files, usually /tmp unless -T is specified */char indexnumberbuf[256]; /* to read in first few lines of the index */char *index_argv[MAX_ARGS];int index_argc = 0;int bestmatcherrors=0; /* set during index search, used later on */int patindex; int patbufpos = -1;char tempfile[MAX_NAME_LEN];char *filenames_file = NULL;char *bitfield_file = NULL;/* agrep search */char *agrep_argv[MAX_ARGS];int agrep_argc = 0;CHAR *FileOpt; /* the option list after -F */int fileopt_length;CHAR GPattern[MAXPAT];int GM;CHAR APattern[MAXPAT];int AM;CHAR GD_pattern[MAXPAT];int GD_length;CHAR **GTextfiles;CHAR **GTextfilenames;int *GFileIndex;int GNumfiles;int GNumpartitions;CHAR GProgname[MAXNAME];/* persistent file descriptors */#if BG_DEBUGFILE *debug; /* file descriptor for debugging output */#endif /*BG_DEBUG*/FILE *timesfp = NULL;FILE *timesindexfp = NULL;FILE *indexfp = NULL; /* glimpse index */FILE *partfp = NULL; /* glimpse partitions */FILE *minifp = NULL; /* glimpse turbo */FILE *nullfp = NULL; /* to discard output: agrep -s doesn't work properly */int svstdin = 0, svstdout = 1, svstderr = 2;static int one = 1; /* to set socket option so that glimpseserver releases socket after death *//* Index manipulation */struct offsets **src_offset_table;struct offsets **multi_dest_offset_table[MAXNUM_PAT];unsigned int *multi_dest_index_set[MAXNUM_PAT];extern free_list();struct stat index_stat_buf, file_stat_buf;int timesindexsize = 0;int last_Y_filenumber = 0;/* Direct agrep access for bytelevel-indices */extern int COUNT, INVERSE, TCOMPRESSED, NOFILENAME, POST_FILTER, OUTTAIL, BYTECOUNT, SILENT, NEW_FILE, LIMITOUTPUT, LIMITPERFILE, LIMITTOTALFILE, PRINTRECORD, DELIMITER, SILENT, FILENAMEONLY, num_of_matched, prev_num_of_matched, FILEOUT;CHAR matched_region[MAX_REGION_LIMIT*2 + MAXPATT*2];int RegionLimit=DEFAULT_REGION_LIMIT;/* Returns number of matched records/lines. Uses agrep's options to output stuff nicely; never called with RecordLevelIndex set */intglimpse_search(AM, APattern, GD_length, GD_pattern, realfilename, filename, fileindex, src_offset_table, outfp) int AM; unsigned char APattern[]; int GD_length; unsigned char GD_pattern[]; char *realfilename; char *filename; int fileindex; struct offsets *src_offset_table[]; FILE *outfp;{ FILE *infp; char sig[SIGNATURE_LEN]; struct offsets **p1, *tp1; CHAR *text, *curtextend, *curtextbegin, c; int times; int num, ret, totalret = 0; int prevoffset = 0, begininterval = 0, endinterval = -1; CHAR *beginregionptr = 0, *endregionptr = 0; int beginpage = 0, endpage = -1; static int MAXTIMES, MAXPGTIMES, pagesize; static int first_time = 1; /* * If can't open file for read, quit * For each offset for that file: * seek to that point * go back until delimiter, go forward until delimiter, output it: MAX_REGION_LIMIT is 16K on either side. * read in units of RegionLimit * before outputting matched record, use options to put prefixes (or use memagrep which does everything?) * Algorithm changed: don't read same page in twice. */ if (first_time) { pagesize = DISKBLOCKSIZE; MAXTIMES = ((MAX_REGION_LIMIT / RegionLimit) > 1) ? (MAX_REGION_LIMIT / RegionLimit) : 1; MAXPGTIMES = ((MAX_REGION_LIMIT / pagesize) > 1) ? (MAX_REGION_LIMIT / pagesize) : 1; first_time = 0; } /* Safety: must end/begin with delim */ memcpy(matched_region, GD_pattern, GD_length); memcpy(matched_region+MAXPATT+2*MAX_REGION_LIMIT, GD_pattern, GD_length); text = &matched_region[MAX_REGION_LIMIT+MAXPATT]; if ((infp = my_fopen(filename, "r")) == NULL) return 0; NEW_FILE = ON;#if 0 /* Cannot search in .CZ files since offset computations will be incorrect */ TCOMPRESSED = ON; if (!tuncompressible_filename(file_list[i], strlen(file_list[i]))) TCOMPRESSED = OFF; num_read = fread(sig, 1, SIGNATURE_LEN, infp); if ((TCOMPRESSED == ON) && tuncompressible(sig, num_read)) { EASYSEARCH = sig[SIGNATURE_LEN-1]; if (!EASYSEARCH) { fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName); /* not filename!!! */ } } else TCOMPRESSED = OFF;#endif /*0*/ p1 = &src_offset_table[fileindex]; while (*p1 != NULL) { if ( (begininterval <= (*p1)->offset) && (endinterval > (*p1)->offset) ) { /* already covered this area */#if DEBUG printf("ignoring %d in [%d,%d]\n", (*p1)->offset, begininterval, endinterval);#endif /*DEBUG*/ tp1 = *p1; *p1 = (*p1)->next; my_free(tp1, sizeof(struct offsets)); continue; } TCOMPRESSED = OFF;#if 1 if ( (beginpage <= (*p1)->offset) && (endpage >= (*p1)->offset) && (text + ((*p1)->offset - prevoffset) + GD_length < endregionptr)) { /* beginregionptr = curtextend - GD_length; /* prevent next curtextbegin to go behind previous curtextend (!) */ text += ((*p1)->offset - prevoffset); prevoffset = (*p1)->offset; if (!((curtextend = forward_delimiter(text, endregionptr, GD_pattern, GD_length, 1)) < endregionptr)) goto fresh_read; if (!((curtextbegin = backward_delimiter(text, beginregionptr, GD_pattern, GD_length, 0)) > beginregionptr)) goto fresh_read; } else { /* NOT within an area already read: must read another page: if record overlapps page, might read page twice: no time to fix */ fresh_read: prevoffset = (*p1)->offset; text = &matched_region[MAX_REGION_LIMIT+MAXPATT]; /* middle: points to occurrence of pattern */ endpage = beginpage = ((*p1)->offset / pagesize) * pagesize; /* endpage = (((*p1)->offset + pagesize) / pagesize) * pagesize */ endregionptr = beginregionptr = text - ((*p1)->offset - beginpage); /* overlay physical place starting from this logical point */ /* endregionptr = text + (endpage - (*p1)->offset); */ curtextbegin = curtextend = text; times = 0; while (times < MAXPGTIMES) { fseek(infp, endpage, 0); num = (&matched_region[MAX_REGION_LIMIT*2+MAXPATT] - endregionptr < pagesize) ? (&matched_region[MAX_REGION_LIMIT*2+MAXPATT] - endregionptr) : pagesize; if ((num = fread(endregionptr, 1, num, infp)) <= 0) break; endpage += num; endregionptr += num; if (endregionptr <= text) { curtextend = text; /* error in value of offset: file was modified and offsets no longer true: your RISK! */ break; } if (((curtextend = forward_delimiter(text, endregionptr, GD_pattern, GD_length, 1)) < endregionptr) || (endregionptr >= &matched_region[MAX_REGION_LIMIT*2 + MAXPATT])) break; times ++; } times = 0; while (times < MAXPGTIMES) { /* I have already read the initial page since endpage is beginpage initially */ if ((curtextbegin = backward_delimiter(text, beginregionptr, GD_pattern, GD_length, 0)) > beginregionptr) break; if (beginpage > 0) { if (beginregionptr - pagesize < &matched_region[MAXPATT]) { if ((num = beginregionptr - &matched_region[MAXPATT]) <= 0) break; } else num = pagesize; beginpage -= num; beginregionptr -= num; } else break; times ++; fseek(infp, beginpage, 0); fread(beginregionptr, 1, num, infp); } }#else /*1*/ /* Find forward delimiter (including delimiter) */ times = 0; fseek(infp, (*p1)->offset, 0); while (times < MAXTIMES) { if ((num = fread(text+RegionLimit*times, 1, RegionLimit, infp)) > 0) curtextend = forward_delimiter(text, text+RegionLimit*times+num, GD_pattern, GD_length, 1); if ((curtextend < text+RegionLimit*times+num) || (num < RegionLimit)) break; times ++; } /* Find backward delimiter (including delimiter) */ times = 0; while (times < MAXTIMES) { num = ((*p1)->offset - RegionLimit*(times+1)) > 0 ? ((*p1)->offset - RegionLimit*(times+1)) : 0; fseek(infp, num, 0); if (num > 0) { fread(text-RegionLimit*(times+1), 1, RegionLimit, infp); curtextbegin = backward_delimiter(text, text-RegionLimit*(times+1), GD_pattern, GD_length, 0); } else { fread(text-RegionLimit*times-(*p1)->offset, 1, (*p1)->offset, infp); curtextbegin = backward_delimiter(text, text-RegionLimit*times-(*p1)->offset, GD_pattern, GD_length, 0); } if ((num <= 0) || (curtextbegin > text-RegionLimit*(times+1))) break; times ++; }#endif /*1*/ /* set interval and delete the entry */ begininterval = (*p1)->offset - (text - curtextbegin); endinterval = (*p1)->offset + (curtextend - text); if (strncmp(curtextbegin, GD_pattern, GD_length)) { /* always pass enclosing delimiters to agrep; since we have seen text before curtextbegin + we have space, we can overwrite */ memcpy(curtextbegin - GD_length, GD_pattern, GD_length); curtextbegin -= GD_length; }#if DEBUG c = *curtextend; *curtextend = '\0'; printf("%s [%d < %d < %d], text = %d: %s\n", CurrentFileName, begininterval, (*p1)->offset, endinterval, text, curtextbegin); *curtextend = c;#endif /*DEBUG*/ tp1 = *p1; *p1 = (*p1)->next; my_free(tp1, sizeof(struct offsets)); if (curtextend <= curtextbegin) continue; /* error in offsets/delims */ /* * Don't call memagrep since that is heavy weight. Call exec * directly after doing agrep_search()'s preprocessing here. * PS: can add agrep variable not to do delim search if called from here * since that prevents unnecessarily scanning the buffer for the 2nd time. */ CurrentByteOffset = begininterval+1; SetCurrentByteOffset = 1; first_search = 1; if (first_search) { if ((ret = memagrep_search(AM, APattern, curtextend-curtextbegin, curtextbegin, 0, outfp)) > 0) totalret ++; /* += ret */ else if ((ret < 0) && (errno == AGREP_ERROR)) { fclose(infp); return -1; } first_search = 0; } else { /* All agrep globals are properly set: has a bug because agrep's globals aren't properly reinitialized without agrep_search :-( */ agrep_finalfp = (FILE *)outfp; agrep_outlen = 0; agrep_outbuffer = NULL; agrep_outpointer = 0; execfd = agrep_initialfd = -1; agrep_inbuffer = curtextbegin; agrep_inlen = curtextend - curtextbegin;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -