⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 glimpse.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */
/* ./glimpse/index/glimpse.c */
#include "glimpse.h"
#include <stdlib.h>
#include <sys/time.h>
#if	ISO_CHAR_SET
#include <locale.h>	/* support for 8bit character set:ew@senate.be */
#endif
extern char **environ;
extern int errno;
extern FILE *TIMEFILE;	/* file descriptor for sorting .glimpse_filenames by time */
#if	BG_DEBUG
extern FILE  *LOGFILE; 	/* file descriptor for LOG output */
#endif	/*BG_DEBUG*/
extern FILE  *STATFILE;	/* file descriptor for statistical data about indexed files */
extern FILE  *MESSAGEFILE;	/* file descriptor for important messages meant for the user */
extern char  INDEX_DIR[MAX_LINE_LEN];
extern struct stat istbuf;
extern char	*TEMP_DIR; /* directory to store glimpse temporary files, usually /tmp unless -T is specified */
#ifdef BUILDCAST
/* TEMP_DIR is normally defined in ../main.c; if we're building
 * buildcast, that's not linked in, so we need to define one here. */
/* char * TEMP_DIR = NULL; */
static char * TEMP_DIR = "/tmp";
#endif /* BUILDCAST */
extern int indexable_char[256];
extern int GenerateHash;
extern int KeepFilenames;
extern int OneFilePerBlock;
extern int IndexNumber;
extern int CountWords;
extern int StructuredIndex;
extern int attr_num;
extern int MAXWORDSPERFILE;
extern int NUMERICWORDPERCENT;
extern int AddToIndex;
extern int DeleteFromIndex;
extern int PurgeIndex;
extern int FastIndex;
extern int BuildDictionary;
extern int BuildDictionaryExisting;
extern int CompressAfterBuild;
extern int IncludeHigherPriority;
extern int FilenamesOnStdin;
extern int ExtractInfo;
extern int InfoAfterFilename;
extern int FirstWordOfInfoIsKey;
extern int UseFilters;
extern int ByteLevelIndex;
extern int RecordLevelIndex;
extern int StoreByteOffset;
extern char rdelim[MAX_LINE_LEN];
extern char old_rdelim[MAX_LINE_LEN];
extern int rdelim_len;
/* extern int IndexUnderscore; */
extern int IndexableFile;
extern int MAX_PER_MB, MAX_INDEX_PERCENT;
extern int I_THRESHOLD;
extern int BigHashTable;
extern int BigFilenameHashTable;
extern int IndexEverything;
extern int BuildTurbo;
extern int SortByTime;
extern int AddedMaxWordsMessage;
extern int AddedMixedWordsMessage;
extern int file_num;
extern int old_file_num;
extern int new_file_num;
extern int file_id;
extern int part_num;
extern char **name_list[MAXNUM_INDIRECT];
extern int p_table[MAX_PARTITION];
extern int  *size_list[MAXNUM_INDIRECT];
extern int p_size_list[];
extern unsigned int *disable_list;
extern int memory_usage;
extern int mask_int[];
extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE;
extern struct indices	*deletedlist;
extern char sync_path[MAX_LINE_LEN];
extern int ATLEASTONEFILE;
extern set_usemalloc();	/* compress/misc.c */

char IProgname[MAX_LINE_LEN];
int ModifyFilenamesIndex = 0;

/*
 * Has newnum crossed the boundary of an encoding? This is so rare that we
 * needn't optimize it by changing the format of the old index and reusing it.
 *
 * Returns 0 when oldnum <= 0. Otherwise returns non-zero iff oldnum is at or
 * below one of MaxNum8bPartition / MaxNum12bPartition / MaxNum16bPartition
 * while newnum is above that same limit; in that case a notice is also
 * written to MESSAGEFILE.
 */
int
cross_boundary(oldnum, newnum)
	int	oldnum, newnum;
{
	int	ret;

	if (oldnum <= 0) return 0;
	ret =  ( ((oldnum <= MaxNum8bPartition) && (newnum > MaxNum8bPartition)) ||
		 ((oldnum <= MaxNum12bPartition) && (newnum > MaxNum12bPartition)) ||
		 ((oldnum <= MaxNum16bPartition) && (newnum > MaxNum16bPartition)) );
	if (ret) fprintf(MESSAGEFILE, "Must change index format. Commencing fresh indexing...\n");
	return ret;
}

/*
 * Try to find the full path of the "sync" program.
 *
 * sync_path is first set to the bare name "sync". The shell command
 * "whereis sync" is then run with its output redirected to the file
 * TEMP_DIR/zz.<pid>; that file is read back, removed, and its first two
 * whitespace-separated words are examined. If the first word starts with
 * "sync" and the second word contains "sync", the second word is copied
 * into sync_path.
 *
 * Returns 1 if sync_path was replaced by the path reported by whereis,
 * 0 otherwise (sync_path is then left as "sync").
 */
int
determine_sync()
{
	char	tmpname[1024];			/* TEMP_DIR/zz.<pid> */
	char	cmd[sizeof(tmpname) + 32];	/* shell command line */
	char	out[1024];			/* raw whereis output */
	char	s1[256], s2[256];
	FILE	*fp;
	int	len, nread;

	strcpy(sync_path, "sync");

	/* Build the temporary file name once; give up rather than truncate it. */
	len = snprintf(tmpname, sizeof(tmpname), "%s/zz.%d", TEMP_DIR, (int)getpid());
	if ((len < 0) || (len >= (int)sizeof(tmpname))) return 0;

	/* Change it to use which: not urgent. */
	/* NOTE(review): TEMP_DIR is pasted into the shell command unquoted. */
	snprintf(cmd, sizeof(cmd), "exec whereis sync > %s", tmpname);
	system(cmd);

	if ((fp = fopen(tmpname, "r")) == NULL) return 0;
	nread = fread(out, 1, sizeof(out) - 1, fp);
	fclose(fp);
	unlink(tmpname);
	if (nread <= 0) return 0;
	out[nread] = '\0';	/* terminate string */

	/*
	 * Bound both conversions to the buffer sizes and insist on two words:
	 * when whereis prints only "sync:" the second word is never stored.
	 */
	s1[0] = s2[0] = '\0';
	if (sscanf(out, "%255s%255s", s1, s2) < 2) return 0;
	if (strncmp(s1, "sync", 4) != 0) return 0;
	if (strstr(s2, "sync") == NULL) return 0;
	if (strlen(s2) >= sizeof(sync_path)) return 0;

	strcpy(sync_path, s2);
	return 1;
}

main(argc, argv)
int argc;
char **argv;
{
    int pid = getpid();
    int	i, m = 0;
    char *indexdir, es1[MAX_LINE_LEN], es2[MAX_LINE_LEN];
    char s[MAX_LINE_LEN], s1[MAX_LINE_LEN];
    char working_dir[MAX_LINE_LEN];
    FILE *tmpfp;
    char hash_file[MAX_LINE_LEN], string_file[MAX_LINE_LEN], freq_file[MAX_LINE_LEN];
    char tmpbuf[1024];
    struct stat stbuf;
    char name[MAX_LINE_LEN];
    char outname[MAX_LINE_LEN];
    int specialwords, threshold;
    int backup;
    struct indices *get_removed_indices();
    struct timeval tv;

#if	ISO_CHAR_SET
    setlocale(LC_ALL,""); /* support for 8bit character set: ew@senate.be, Henrik.Martin@eua.ericsson.se */
#endif

    BuildDictionary = ON;
    set_usemalloc();
    srand(pid);
    umask(077);
    determine_sync();

    INDEX_DIR[0] = '\0';
    specialwords = threshold = -1;	/* so that compute_dictionary can use defaults not visible here */
    strncpy(IProgname, argv[0], MAX_LINE_LEN);
    memset(size_list, '\0', sizeof(int *) * MAXNUM_INDIRECT);	/* free it once partition successfully calculates p_size_list */
    memset(name_list, '\0', sizeof(char **) * MAXNUM_INDIRECT);
    memset(p_size_list, '\0', sizeof(int) * MAX_PARTITION);
build_filename_hashtable((char *)NULL, 0);    /*     * Process options.     */    while (argc > 1) {	if (strcmp(argv[1], "-help") == 0) {	    return usage(1);	}#if	!BUILDCAST	else if (strcmp(argv[1], "-R") == 0) {	    ModifyFilenamesIndex = 1;	    argc --; argv ++;	}	else if (strcmp(argv[1], "-V") == 0) {	    printf("\nThis is glimpseindex version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);	    return(0);	}	else if (strcmp(argv[1], "-T") == 0) {	    BuildTurbo = ON;	    argc --; argv ++;	}	else if (strcmp(argv[1], "-I") == 0) {	    IndexableFile = ON;	    argc --; argv ++;	}	else if(strcmp(argv[1], "-a") == 0) {	    AddToIndex = ON;	    argc--; argv++;	}	else if(strcmp(argv[1], "-b") == 0) {	    ByteLevelIndex = ON;	    argc--; argv++;	}	else if(strcmp(argv[1], "-O") == 0) {	    StoreByteOffset = ON;	    argc--; argv++;	}	else if(strcmp(argv[1], "-r") == 0) {	    ByteLevelIndex = ON;	    RecordLevelIndex = ON;	    if (argc <= 2) {		fprintf(stderr, "The -r option must be followed by a delimiter\n");		return usage(1);	    }	    else {		strncpy(rdelim, argv[2], MAX_LINE_LEN);		rdelim[MAX_LINE_LEN-1] = '\0';		rdelim_len = strlen(rdelim);		strcpy(old_rdelim, rdelim);		argc -= 2; argv += 2;	    }	}	else if(strcmp(argv[1], "-c") == 0) {	    CountWords = ON;	    argc--; argv++;	}	else if(strcmp(argv[1], "-d") == 0) {	    DeleteFromIndex = ON;	    argc --; argv ++;	}	else if(strcmp(argv[1], "-D") == 0) {	    PurgeIndex = OFF;	    argc --; argv ++;	}	else if(strcmp(argv[1], "-f") == 0) {	    FastIndex = ON;	    argc--; argv++;	}	else if (strcmp(argv[1], "-o") == 0) {	    OneFilePerBlock = ON;	    argc --; argv ++;	}	else if (strcmp(argv[1], "-s") == 0) {	    StructuredIndex = ON;	    argc --; argv ++;	}	else if(strcmp(argv[1], "-z") == 0) {	    UseFilters = ON;	    argc--; argv++;	}	else if(strcmp(argv[1], "-t") == 0) {	    SortByTime = ON;	    argc--; argv++;	}	else if (strcmp(argv[1], "-C") == 0) {		BigFilenameHashTable = 1;		argc --; argv ++;	}#else	/*!BUILDCAST*/	else 
if (strcmp(argv[1], "-V") == 0) {	    printf("\nThis is buildcast version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);	    return(0);	}	else if(strcmp(argv[1], "-C") == 0) {	    CompressAfterBuild = ON;	    argc --; argv ++;	}	else if(strcmp(argv[1], "-E") == 0) {	    BuildDictionaryExisting = ON;	    argc --; argv ++;	}	else if (strcmp(argv[1], "-t") == 0) {	    if ((argc <= 2) || !(isdigit(argv[2][0]))) {		return usage(1);	    }	    else {		threshold = atoi(argv[2]);		argc -= 2; argv += 2;	    }	}	else if (strcmp(argv[1], "-l") == 0) {	    if ((argc <= 2) || !(isdigit(argv[2][0]))) {		return usage(1);	    }	    else {		specialwords = atoi(argv[2]);		argc -= 2; argv += 2;	    }	}#endif	/*!BUILDCAST*/	else if (strcmp(argv[1], "-M") == 0) {	    if (argc == 2) {		fprintf(stderr, "-M should be followed by the amount of memory in MB for indexing words\n");		return usage(1);	    }	    m = atoi(argv[2]);	    if (m < 1) {		fprintf(stderr, "Ignoring -M %d (< 1 MB). Using default value of about 2 MB\n", m);		return usage(1);	    }	    else {		/*		 * Calculate I_THRESHOLD approximately. 
Note: 2*1024*1024*2 / (2*24 + 32 + 12) = 47662, DEF_I_THRESHOLD = 40000, so OK		 * N * sizeofindices + N*(avgwordlen + sizeoftoken)/indicespertoken <= mem		 * elemsperset = occurrences/indicespertoken		 * N <= mem * occurrences / (sizeofindices*indicespertoken + avgwordlen + sizeoftoken)		 */		I_THRESHOLD = m * 1024 * 1024 * (INDICES_PER_TOKEN) /				(INDICES_PER_TOKEN * sizeof(struct indices) + sizeof(struct token) + AVG_WORD_LEN);		fprintf(stderr, "Using %d words as threshold before merge\n", I_THRESHOLD/INDICES_PER_TOKEN);	    }	    argc -= 2; argv += 2;	}	else if (strcmp(argv[1], "-w") == 0) {	    if (argc == 2) {		fprintf(stderr, "-w should be followed by the number of words\n");		return usage(1);	    }	    MAXWORDSPERFILE = atoi(argv[2]);	    argc -= 2; argv += 2;	}	else if (strcmp(argv[1], "-S") == 0) {	    if (argc == 2) {		fprintf(stderr, "-S should be followed by the stop list limit\n");		return usage(1);	    }	    MAX_PER_MB = MAX_INDEX_PERCENT = atoi(argv[2]);	    argc -= 2; argv += 2;	}	else if(strcmp(argv[1], "-n") == 0) {	    IndexNumber = ON;	    if ((argc <= 2) || !(isdigit(argv[2][0]))) {	/* -n has no arg */		argc --; argv ++;	    }	    else {		NUMERICWORDPERCENT = atoi(argv[2]);		if ((NUMERICWORDPERCENT > 100) || (NUMERICWORDPERCENT < 0)) {		    fprintf(stderr, "The percentage of numeric words must be in [0..100]\n");		    return usage(1);		}		argc-=2; argv+=2;	    }	}	else if(strcmp(argv[1], "-h") == 0) {	    /* I want to generate .glimpse_filehash and .glimpse_filehash_index */	    GenerateHash = ON;	    argc --; argv ++;	}	else if(strcmp(argv[1], "-i") == 0) {	    IncludeHigherPriority = ON;	    argc --; argv ++;	}	else if(strcmp(argv[1], "-k") == 0) {	    /* I want to know what files were there before: used in SFS to compute new sets from old ones */	    KeepFilenames = ON;	    argc --; argv ++;	}	else if (strcmp(argv[1], "-B") == 0) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -