⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 glimpse.c

📁 harvest是一个下载HTML网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
		BigHashTable = 1;		argc --; argv ++;	}	else if (strcmp(argv[1], "-E") == 0) {		IndexEverything = 1;	/* without doing stat tests, etc. */		argc --; argv ++;	}	else if(strcmp(argv[1], "-F") == 0) {	    FilenamesOnStdin = ON;	    argc--; argv++;	}	else if(strcmp(argv[1], "-X") == 0) {	/* extract some info to append after a ' ' to filename in filename-buffer */	    ExtractInfo = ON;	    argc--; argv++;	}	else if(strcmp(argv[1], "-U") == 0) {	/* some information is there after blank after filename on same line as filename-buffer: makes sense only with -F */	    InfoAfterFilename = ON;	    argc--; argv++;	}	else if(strcmp(argv[1], "-K") == 0) {	/* first word of above info/or the extracted info is the key */	    FirstWordOfInfoIsKey = ON;	    argc--; argv++;	}	/*	else if(strcmp(argv[1], "-u") == 0) {	    IndexUnderscore = ON;	    argc--; argv++;	}	*/	else if (strcmp(argv[1], "-H") == 0) {	    if (argc == 2) {		fprintf(stderr, "-H should be followed by a directory name\n");		return usage(1);	    }	    strncpy(INDEX_DIR, argv[2], MAX_LINE_LEN);	    argc -= 2; argv += 2;	}	else break;	/* rest are directory names */    }    if (RecordLevelIndex && StructuredIndex) {	fprintf(stderr, "-r and -s are not compatible!\n");	return usage(1);    }    if (StoreByteOffset && !RecordLevelIndex) {	fprintf(stderr, "ignoring -O since -r was not specified\n");	StoreByteOffset = OFF;    }    if (InfoAfterFilename && !FilenamesOnStdin) {	fprintf(stderr, "-U works only when -F is specified!\n");	return usage(1);    }    if (FirstWordOfInfoIsKey && !(InfoAfterFilename || ExtractInfo)) {	fprintf(stderr, "-K works only when one of -X or -U are specified!\n");	return usage(1);    }    if (RecordLevelIndex) {	/* printf("old_rdelim = %s rdelim = %s rdelim_len = %d\n", old_rdelim, rdelim, rdelim_len); */	preprocess_delimiter(rdelim, rdelim_len, rdelim, &rdelim_len);	/* printf("processed rdelim = %s rdelim_len = %d\n", rdelim, rdelim_len); */    }    if (ModifyFilenamesIndex) {	int	offset = 0;	char	
buffer[1024];	FILE	*filefp, *indexfp;	sprintf(buffer, "%s/%s", INDEX_DIR, NAME_LIST);	if ((filefp = fopen(buffer, "r")) == NULL) {	    fprintf(stderr, "Cannot open %s for reading\n", buffer);	    exit(2);	}	sprintf(buffer, "%s/%s.tmp", INDEX_DIR, NAME_LIST_INDEX);	if ((indexfp = fopen(buffer, "w")) == NULL) {	    fprintf(stderr, "Cannot open %s for writing\n", buffer);	    exit(2);	}	fgets(buffer, 1024, filefp);	/* skip over num. of file names */	offset += strlen(buffer);	while (fgets(buffer, 1024, filefp) != NULL) {		putc((offset & 0xff000000) >> 24, indexfp);		putc((offset & 0xff0000) >> 16, indexfp);		putc((offset & 0xff00) >> 8, indexfp);		putc((offset & 0xff), indexfp);		offset += strlen(buffer);	}	fflush(filefp);	fclose(filefp);	fflush(indexfp);	fclose(indexfp);#if	SFS_COMPAT	sprintf(s, "%s/%s.tmp", INDEX_DIR, NAME_LIST_INDEX);	sprintf(s1, "%s/%s", INDEX_DIR, NAME_LIST_INDEX);	return rename(s, s1);#else	sprintf(buffer, "mv %s/%s.tmp %s/%s", INDEX_DIR, NAME_LIST_INDEX, INDEX_DIR, NAME_LIST_INDEX);	return system(buffer);#endif    }    BuildTurbo = ON;	/* always ON: user can remove .glimpse_turbo if not needed */    /*     * Look for invalid option combos.     
*/    if ((argc<=1) && (!FilenamesOnStdin) && !FastIndex) {	return usage(1);    }    if (DeleteFromIndex && (AddToIndex || CountWords || IndexableFile)) {	/* With -f, it is automatic for files not found in OS but present in index; without it, an explicit set of files is required as argument on cmdline */	fprintf(stderr, "-d cannot be used with -I, -a or -c (see man pages)\n");	exit(1);    }    if (ByteLevelIndex) {	if (MAX_PER_MB <= 0) {	    fprintf(stderr, "Stop list limit (#of occurrences per MB) '%d' must be > 0\n", MAX_PER_MB);	    exit(1);	}    }    else if (OneFilePerBlock) {	if ((MAX_INDEX_PERCENT <= 0) || (MAX_INDEX_PERCENT > 100)) {	    fprintf(stderr, "Stop list limit (%% of occurrences in files) '%d' must be in (0, 100]\n", MAX_INDEX_PERCENT);	    exit(1);	}    }    /*     * Find the index directory since it is used in all options.     */    if (INDEX_DIR[0] == '\0') {	if ((indexdir = getenv("HOME")) == NULL) {	    getcwd(INDEX_DIR, MAX_LINE_LEN-1);	    fprintf(stderr, "Using working-directory '%s' to store index\n\n", INDEX_DIR);	}	else strncpy(INDEX_DIR, indexdir, MAX_LINE_LEN);    }    getcwd(working_dir, MAX_LINE_LEN - 1);    if (-1 == chdir(INDEX_DIR)) {	fprintf(stderr, "Cannot change directory to %s\n", INDEX_DIR);	return usage(0);    }    getcwd(INDEX_DIR, MAX_LINE_LEN - 1);	/* must be absolute path name */    chdir(working_dir);	/* get back to where you were */    if (IndexableFile) {	/* traverse the given directories and output names of files that are indexable on stdout */	SortByTime = OFF;	partition(argc, argv);	return 0;    }    else {#if	BUILDCAST	printf("\nThis is buildcast version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);#else	/*BUILDCAST*/	printf("\nThis is glimpseindex version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);#endif	/*BUILDCAST*/    }    if (ByteLevelIndex) {#if	0	/* We'll worry about these things later */	if (AddToIndex || DeleteFromIndex || FastIndex) {	    fprintf(stderr, "Fresh indexing recommended: -a, -d and -f are 
not supported with -b as yet\n");	    exit(1);	}	AddToIndex = FastIndex = OFF;#endif	CountWords = OFF;	OneFilePerBlock = ON;    }    if (SortByTime) {	if (DeleteFromIndex || AddToIndex) {	    fprintf(stderr, "Fresh indexing recommended: -a and -d are not supported with -t as yet\n");	    exit(1);	}	FastIndex = OFF;	/* automatically shuts it off as of now: we shall optimize -t with -f later */    }    /*     * CONVENTION: all the relevant output is on stdout; warnings/errors are on stderr.     * Initialize / open important files.     */    read_filters(INDEX_DIR, UseFilters);    freq_file[0] = hash_file[0] = string_file[0] = '\0';    strcpy(freq_file, INDEX_DIR);    strcat(freq_file, "/");    strcat(freq_file, DEF_FREQ_FILE);    strcpy(hash_file, INDEX_DIR);    strcat(hash_file, "/");    strcat(hash_file, DEF_HASH_FILE);    strcpy(string_file, INDEX_DIR);    strcat(string_file, "/");    strcat(string_file, DEF_STRING_FILE);    initialize_tuncompress(string_file, freq_file, 0);    sprintf(s, "%s/%s", INDEX_DIR, DEF_TIME_FILE);    if((TIMEFILE = fopen(s, "w")) == 0) {	fprintf(stderr, "can't open %s for writing\n", s);	exit(2);    }#if	BG_DEBUG    sprintf(s, "%s/%s", INDEX_DIR, DEF_LOG_FILE);    if((LOGFILE = fopen(s, "w")) == 0) {	fprintf(stderr, "can't open %s for writing\n", s);	LOGFILE = stderr;    }#endif	/*BG_DEBUG*/    sprintf(s, "%s/%s", INDEX_DIR, DEF_MESSAGE_FILE);    if((MESSAGEFILE = fopen(s, "w")) == 0) {	fprintf(stderr, "can't open %s for writing\n", s);	MESSAGEFILE = stderr;    }    sprintf(s, "%s/%s", INDEX_DIR, DEF_STAT_FILE);    if((STATFILE = fopen(s, "a")) == 0) {	fprintf(stderr, "can't open %s for appending\n", s);	STATFILE = stderr;    }    gettimeofday(&tv, NULL);#if	BUILDCAST    fprintf(STATFILE, "\nThis is buildcast version %s, %s. %s", GLIMPSE_VERSION, GLIMPSE_DATE, ctime(&tv.tv_sec));#else    fprintf(STATFILE, "\nThis is glimpseindex version %s, %s. 
%s", GLIMPSE_VERSION, GLIMPSE_DATE, ctime(&tv.tv_sec));#endif#if	BG_DEBUG    fprintf(LOGFILE, "Index Directory = %s\n\n", INDEX_DIR);#endif	/*BG_DEBUG*/    if (MAXWORDSPERFILE != 0) fprintf(MESSAGEFILE, "Index: maximum number of indexed words per file = %d\n", MAXWORDSPERFILE);    else fprintf(MESSAGEFILE, "Index: maximum number of indexed words per file = infinity\n");    fprintf(MESSAGEFILE, "Index: maximum percentage of numeric words per file = %d\n", NUMERICWORDPERCENT);    set_indexable_char(indexable_char);#if	BUILDCAST    CountWords = ON;    AddToIndex = OFF;    FastIndex = OFF;    /* Save old search-dictionaries */    sprintf(s, "%s/.glimpse_index", INDEX_DIR);    if (!access(s, R_OK)) {	sprintf(s, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	if (-1 == mkdir(s, 0700)) {	    fprintf(stderr, "cannot create temporary directory %s\n", s);	    return -1;	}#if	SFS_COMPAT	sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE);	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	rename(s, s1);#else	sprintf(s, "exec %s -f '%s/%s' '%s/.glimpse_tempdir.%d'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), INDEX_FILE, escapesinglequote(INDEX_DIR, es2), pid);	system(s);#endif#if	SFS_COMPAT	sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	rename(s, s1);#else	sprintf(s, "exec %s -f '%s/%s' '%s/.glimpse_tempdir.%d'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), P_TABLE, escapesinglequote(INDEX_DIR, es2), pid);	system(s);#endif#if	SFS_COMPAT	sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST);	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	rename(s, s1);#else	sprintf(s, "exec %s -f '%s/%s' '%s/.glimpse_tempdir.%d'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), NAME_LIST, escapesinglequote(INDEX_DIR, es2), pid);	system(s);#endif#if	SFS_COMPAT	sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST_INDEX);	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	rename(s, s1);#else	sprintf(s, "exec %s -f '%s/%s' '%s/.glimpse_tempdir.%d'\n", SYSTEM_MV, 
escapesinglequote(INDEX_DIR, es1), NAME_LIST_INDEX, escapesinglequote(INDEX_DIR, es1), pid);	system(s);#endif#if	SFS_COMPAT	sprintf(s, "%s/%s", INDEX_DIR, NAME_HASH);	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	rename(s, s1);#else	sprintf(s, "exec %s -f '%s/%s' '%s/.glimpse_tempdir.%d'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), NAME_HASH, escapesinglequote(INDEX_DIR, es2), pid);	system(s);#endif#if	SFS_COMPAT	sprintf(s, "%s/%s", INDEX_DIR, NAME_HASH_INDEX);	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	rename(s, s1);#else	sprintf(s, "exec %s -f '%s/%s' '%s/.glimpse_tempdir.%d'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), NAME_HASH_INDEX, escapesinglequote(INDEX_DIR, es2), pid);	system(s);#endif#if	SFS_COMPAT	sprintf(s, "%s/%s", INDEX_DIR, MINI_FILE);	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	rename(s, s1);#else	sprintf(s, "exec %s -f '%s/%s' '%s/.glimpse_tempdir.%d'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), MINI_FILE, escapesinglequote(INDEX_DIR, es2), pid);	system(s);#endif#if	SFS_COMPAT	sprintf(s, "%s/%s", INDEX_DIR, DEF_STAT_FILE);	sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	rename(s, s1);#else	sprintf(s, "exec %s -f '%s/%s' '%s/.glimpse_tempdir.%d'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), DEF_STAT_FILE, escapesinglequote(INDEX_DIR, es2), pid);	system(s);#endif	/* Don't save messages, log, debug, etc. 
*/	sprintf(s, "%s/.glimpse_attributes", INDEX_DIR);	if (!access(s, R_OK)) {#if	SFS_COMPAT	    sprintf(s, "%s/%s", INDEX_DIR, ATTRIBUTE_FILE);	    sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	    rename(s, s1);#else	    sprintf(s, "exec %s -f '%s/%s' '%s/.glimpse_tempdir.%d'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), ATTRIBUTE_FILE, escapesinglequote(INDEX_DIR, es2), pid);	    system(s);#endif	}    }    /* Backup old cast-dictionaries: don't use move since indexing might want to use them */    sprintf(s, "%s/.glimpse_quick", INDEX_DIR);    if (!access(s, R_OK)) {	/* there are previous cast dictionaries */	backup = rand();	sprintf(s, "%s/.glimpse_backup.%x", INDEX_DIR, backup);	if (-1 == mkdir(s, 0700)) {	    fprintf(stderr, "cannot create backup directory %s\n", s);	    return -1;	}	sprintf(s, "exec %s -f '%s/.glimpse_quick' '%s/.glimpse_backup.%x'\n", SYSTEM_CP, escapesinglequote(INDEX_DIR, es1), escapesinglequote(INDEX_DIR, es2), backup);	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_compress' '%s/.glimpse_backup.%x'\n", SYSTEM_CP, escapesinglequote(INDEX_DIR, es1), escapesinglequote(INDEX_DIR, es2), backup);	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_compress.index' '%s/.glimpse_backup.%x'\n", SYSTEM_CP, escapesinglequote(INDEX_DIR, es1), escapesinglequote(INDEX_DIR, es2), backup);	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_uncompress' '%s/.glimpse_backup.%x'\n", SYSTEM_CP, escapesinglequote(INDEX_DIR, es1), escapesinglequote(INDEX_DIR, es2), backup);	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_uncompress.index' '%s/.glimpse_backup.%x'\n", SYSTEM_CP, escapesinglequote(INDEX_DIR, es1), escapesinglequote(INDEX_DIR, es2), backup);	system(s);	printf("Saved previous cast-dictionary in %s/.glimpse_backup.%x\n", INDEX_DIR, backup);    }    /* Now index these files, and build new dictionaries */    partition(argc, argv);    initialize_data_structures(file_num);    old_file_num = file_num;    build_index();    cleanup();    
save_data_structures();    destroy_filename_hashtable();    uninitialize_common();    uninitialize_tcompress();    uninitialize_tuncompress();    compute_dictionary(threshold, DISKBLOCKSIZE, specialwords, INDEX_DIR);    if (CompressAfterBuild) {	/* For the new compression */	if (!initialize_tcompress(hash_file, freq_file, TC_ERRORMSGS)) goto docleanup;	printf("Compressing files with new dictionary...\n");	/* Use the set of file-names collected during partition() / modified during build_hash */	for(i=0; i<file_num; i++) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -