📄 glimpse.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
	    if ((disable_list != NULL) && (disable_list[block2index(i)] & mask_int[i%(8*sizeof(int))])) continue;	/* nop since disable_list IS NULL */	    strcpy(name, LIST_GET(name_list, i));	    tcompress_file(name, outname, TC_REMOVE | TC_EASYSEARCH | TC_OVERWRITE | TC_NOPROMPT);	}    }docleanup:    /* Restore old search-dictionaries */    sprintf(s, "%s/.glimpse_tempdir.%d/.glimpse_index", INDEX_DIR, pid);    if (!access(s, R_OK)) {#if	SFS_COMPAT	sprintf(s1, "%s/%s", INDEX_DIR, INDEX_FILE);	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, INDEX_FILE);	rename(s, s1);	sprintf(s1, "%s/%s", INDEX_DIR, P_TABLE);	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, P_TABLE);	rename(s, s1);	sprintf(s1, "%s/%s", INDEX_DIR, NAME_LIST);	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_LIST);	rename(s, s1);	sprintf(s1, "%s/%s", INDEX_DIR, NAME_LIST_INDEX);	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_LIST_INDEX);	rename(s, s1);	sprintf(s1, "%s/%s", INDEX_DIR, NAME_HASH);	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_HASH);	rename(s, s1);	sprintf(s1, "%s/%s", INDEX_DIR, NAME_HASH_INDEX);	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_HASH_INDEX);	rename(s, s1);	sprintf(s1, "%s/%s", INDEX_DIR, MINI_FILE);	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, MINI_FILE);	rename(s, s1);	sprintf(s1, "%s/%s", INDEX_DIR, DEF_STAT_FILE);	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, DEF_STAT_FILE);	rename(s, s1);	sprintf(s1, "%s/%s", INDEX_DIR, ATTRIBUTE_FILE);	sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, ATTRIBUTE_FILE);	rename(s, s1);#else	/* sprintf(s, "exec %s -f %s/.glimpse_tempdir.%d/.glimpse_* %s\n", SYSTEM_MV, INDEX_DIR, pid, INDEX_DIR); */	sprintf(s, "exec %s -f '%s/.glimpse_tempdir.%d/%s' '%s'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), pid, escapesinglequote(INDEX_FILE, es2), INDEX_DIR);	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_tempdir.%d/%s' '%s'\n", SYSTEM_MV, 
escapesinglequote(INDEX_DIR, es1), pid, P_TABLE, escapesinglequote(INDEX_DIR, es2));	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_tempdir.%d/%s' '%s'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), pid, NAME_LIST, escapesinglequote(INDEX_DIR, es2));	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_tempdir.%d/%s' '%s'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), pid, NAME_LIST_INDEX, escapesinglequote(INDEX_DIR, es2));	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_tempdir.%d/%s' '%s'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), pid, NAME_HASH, escapesinglequote(INDEX_DIR, es2));	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_tempdir.%d/%s' '%s'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), pid, NAME_HASH_INDEX, escapesinglequote(INDEX_DIR, es2));	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_tempdir.%d/%s' '%s'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), pid, MINI_FILE, escapesinglequote(INDEX_DIR, es2));	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_tempdir.%d/%s' '%s'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), pid, DEF_STAT_FILE, escapesinglequote(INDEX_DIR, es2));	system(s);	sprintf(s, "exec %s -f '%s/.glimpse_tempdir.%d/%s' '%s'\n", SYSTEM_MV, escapesinglequote(INDEX_DIR, es1), pid, ATTRIBUTE_FILE, escapesinglequote(INDEX_DIR, es2));	system(s);#endif	sprintf(s, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);	rmdir(s);    }    printf("\nBuilt new cast-dictionary in %s\n", INDEX_DIR);#else	/*BUILDCAST*/    if (AddToIndex || DeleteFromIndex || FastIndex) {	/* Not handling byte level indices here for now */	int	indextype = 0, indexnumber = OFF, structuredindex = OFF, recordlevelindex = OFF, temp_attr_num = 0, bytelevelindex = OFF;	char	temp_rdelim[MAX_LINE_LEN];	sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE);	if (-1 == stat(s, &istbuf)) {	    if (AddToIndex || DeleteFromIndex) {		fprintf(stderr, "Cannot find previous index %s! 
Fresh indexing recommended\n", s);		return usage(0);	    }	    file_num = 0;	    file_id = 0;	    part_num = 1;	    goto fresh_indexing;	}	/* Find out existing index of words and partitions/filenumbers */	if ((indextype = get_index_type(s, &indexnumber, &indextype, &structuredindex, temp_rdelim)) < 0) {#if	0	    fprintf(stderr, "Fresh indexing recommended: -a and -f are not supported with -b as yet\n");	    exit(1);	    /* we support it now */#endif	}	if (structuredindex == -2) {	    recordlevelindex = 1;	    bytelevelindex = 1;	}	if (structuredindex <= 0) structuredindex = 0;	else {	    temp_attr_num = structuredindex;	    structuredindex = 1;	}	file_num = part_num = 0;	sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST);	file_num = get_array_of_lines(s, name_list, MaxNum24bPartition, 1);	initialize_disable_list(file_num);	initialize_data_structures(file_num);	if (!indextype) {		sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);		part_num = get_table(s, p_table, MAX_PARTITION, 1) - 1;	/* part_num INCLUDES last partition */	}	else merge_splits();	/* Check for errors, Set OneFilePerBlock */	if ( (file_num <= 0) || (!indextype && (part_num <= 0)) ) {	    if (AddToIndex || DeleteFromIndex) {		fprintf(stderr, "Cannot find previous glimpseindex files! 
Fresh indexing recommended\n");		return usage(0);	    }	    file_num = 0;	    file_id = 0;	    part_num = 1;	    my_free(disable_list);	    disable_list = NULL;	    goto fresh_indexing;	}	if (OneFilePerBlock && !indextype) {	    fprintf(stderr, "Warning: ignoring option -o: using format of existing index\n");	    OneFilePerBlock = 0;	    ByteLevelIndex = 0;	}	else {	    OneFilePerBlock = abs(indextype);	    if (indextype < 0) ByteLevelIndex = ON;	}	if (StructuredIndex && !structuredindex) {	    fprintf(stderr, "Warning: ignoring option -s: using format of existing index\n");	    StructuredIndex = 0;	    attr_num = 0;	}	else {	    StructuredIndex = structuredindex;	    attr_num = temp_attr_num;	}	if (RecordLevelIndex && !recordlevelindex) {	    fprintf(stderr, "Warning: ignoring option -r: using format of existing index\n");	    RecordLevelIndex = 0;	    ByteLevelIndex = 0;	    rdelim[0] = '\0';	    old_rdelim[0] = '\0';	    rdelim_len = 0;	}	else {	    RecordLevelIndex = recordlevelindex;	    strcpy(old_rdelim, temp_rdelim);	    strcpy(rdelim, old_rdelim);	    rdelim_len = strlen(rdelim);	    preprocess_delimiter(rdelim, rdelim_len, rdelim, &rdelim_len);	}	/* Used in FastIndex for all existing files, used in AddToIndex/DeleteFromIndex if we are trying to add/remove an existing file */	build_filename_hashtable(name_list, file_num);#if	0	/* Test if these are inverses of each other */	save_data_structures();	merge_splits();#endif	/*0*/	/*	 * FastIndex: set disable-flag for unchanged files: remove AND	 * disable non-existent files. Let hole remain in file-names/partitions.	 
*/	if (FastIndex) {	    for (i=0; i<file_num; i++)		if (-1 == my_stat(LIST_GET(name_list, i), &stbuf)) {			remove_filename(i, -1);		}		else if (((stbuf.st_mode & S_IFMT) == S_IFREG) && (stbuf.st_ctime <= istbuf.st_ctime)) {		    /* This is just used as a cache since exclude/include processing is not done here: see dir.c */		    disable_list[block2index(i)] |= mask_int[i % (8*sizeof(int))];		}		else {		    /* Can't do it for directories since files in it can be modified w/o date reflected in the directory. Same for symlinks. */		    LIST_ADD(size_list, i, stbuf.st_size, int);		    disable_list[block2index(i)] &= ~(mask_int[i % (8*sizeof(int))]);		}	}	/*	 * AddToIndex without FastIndex: disable all existing files, remove those that don't exist now.	 * Out of old ones, only ADDED FILES are re-enabled: dir.c	 */	else if (AddToIndex) {	    for (i=0; i<file_num; i++) {		if (-1 == my_stat(LIST_GET(name_list, i), &stbuf)) {		    remove_filename(i, -1);		}		else {		    LIST_ADD(size_list, i, stbuf.st_size, int);	/* ONLY for proper statistics in save_data_structures() */		    disable_list[block2index(i)] |= mask_int[i % (8*sizeof(int))];		}	    }	}	/* else: DeleteFromIndex without FastIndex: don't touch other files */	old_file_num = file_num;	destroy_data_structures();	/* Put old/new files into partitions/filenumbers */	if (-1 == oldpartition(argc, argv)) {	    for(i=0;i<file_num;i++) {#if	BG_DEBUG		memory_usage -= (strlen(LIST_GET(name_list, i)) + 2);#endif	/*BG_DEBUG*/		if (LIST_GET(name_list, i) != NULL) {			my_free(LIST_GET(name_list, i), 0);			LIST_SUREGET(name_list, i) = NULL;		}	    }	    file_num = 0;	    file_id = 0;	    for (i=0;i<part_num; i++) {		p_table[i] = 0;	    }	    part_num = 1;	    my_free(disable_list);	    disable_list = NULL;	    goto fresh_indexing;	}	/* Reindex all the files but use the file-names obtained with oldpartition() */	if (cross_boundary(OneFilePerBlock, file_num)) {	    my_free(disable_list);	    disable_list = NULL;	}	
initialize_data_structures(file_num);	if (!DeleteFromIndex || FastIndex) build_index();	if ((deletedlist = get_removed_indices()) == NULL) new_file_num = file_num;	else if (PurgeIndex) new_file_num = purge_index();#if	BG_DEBUG	fprintf(LOGFILE, "Built indices in %s/%s\n", INDEX_DIR, INDEX_FILE);#endif	/*BG_DEBUG*/	goto docleanup;    }fresh_indexing:    /* remove it to create space since it can be large: don't need for fresh indexing */    sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);    unlink(s);    /* These should be zeroed since they can confuse fsize and fsize_directory() */    AddToIndex = 0;    FastIndex = 0;#if	BG_DEBUG    fprintf(LOGFILE, "Commencing fresh indexing\n");#endif	/*BG_DEBUG*/    partition(argc, argv);    destroy_filename_hashtable();    initialize_data_structures(file_num);    old_file_num = file_num;    build_index();#if	BG_DEBUG    fprintf(LOGFILE, "\nBuilt indices in %s/%s\n", INDEX_DIR, INDEX_FILE);#endif	/*BG_DEBUG*/docleanup:    cleanup();    save_data_structures();    destroy_filename_hashtable();#if	BG_DEBUG    fflush(LOGFILE);    fclose(LOGFILE);#endif	/*BG_DEBUG*/    fflush(MESSAGEFILE);    fclose(MESSAGEFILE);    fflush(STATFILE);    fclose(STATFILE);    if (AddedMaxWordsMessage) printf("\nSome files contributed > %d words to the index: check %s\n", MAXWORDSPERFILE, DEF_MESSAGE_FILE);    if (AddedMixedWordsMessage) printf("Some files had numerals in > %d%% of the indexed words: check %s\n", NUMERICWORDPERCENT, DEF_MESSAGE_FILE);    printf("\nIndex-directory: \"%s\"\nGlimpse-files created here:\n", INDEX_DIR);    chdir(INDEX_DIR);    sprintf(s, "exec %s -l .glimpse_* > %s/%d\n", SYSTEM_LS, TEMP_DIR,pid);    system(s);    sprintf(s, "%s/%d", TEMP_DIR,pid);    if ((tmpfp = fopen(s, "r")) != NULL) {	memset(tmpbuf, '\0', 1024);	while(fgets(tmpbuf, 1024, tmpfp) != NULL) fputs(tmpbuf, stdout);	fflush(tmpfp);	fclose(tmpfp);	unlink(s);    }    else fprintf(stderr, "cannot open %s to `cat': check %s for .glimpse - files\n", s, INDEX_DIR);#endif	
/*BUILDCAST*/    if (!ATLEASTONEFILE) exit(1);    return 0;}cleanup(){    char s[MAX_LINE_LEN];    sprintf(s, "%s/%s", INDEX_DIR, I1);    unlink(s);    sprintf(s, "%s/%s", INDEX_DIR, I2);    unlink(s);    sprintf(s, "%s/%s", INDEX_DIR, I3);    unlink(s);    sprintf(s, "%s/%s", INDEX_DIR, O1);    unlink(s);    sprintf(s, "%s/%s", INDEX_DIR, O2);    unlink(s);    sprintf(s, "%s/%s", INDEX_DIR, O3);    unlink(s);    sprintf(s, "%s/.glimpse_apply.%d", INDEX_DIR, getpid());    unlink(s);}#if	!BUILDCASTusage(flag)int	flag;{	if (flag) fprintf(stderr, "\nThis is glimpseindex version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);	fprintf(stderr, "usage: %s [-help] [-a] [-d] [-f] [-i] [-n [X]] [-o] [-r delim] [-s] [-t] [-w X] [-B] [-F] [-H DIR] [-I] [-M X] [-R] [-S X] [-T] [-V] NAMES\n", IProgname);	fprintf(stderr, "List of options (see %s for more details):\n", GLIMPSE_URL);	fprintf(stderr, "-help: outputs this menu\n");	fprintf(stderr, "-a: add given files/directories to an existing index\n");	fprintf(stderr, "-b: build a (large) byte-level index \n");	fprintf(stderr, "-B: use a hash table that is 4 times bigger (256k entries instead of 64K) \n");	fprintf(stderr, "-d NAMES: delete (file or directory) NAMES from an existing index\n");	fprintf(stderr, "-D NAMES: delete NAMES from the list of files (but not from the index!)\n");	fprintf(stderr, "-E: do not run a check on file types\n");	fprintf(stderr, "-f: incremental indexing (add all newly modified files)\n");	fprintf(stderr, "-F: the list of files to index is obtained from standard input\n");	fprintf(stderr, "-h: generates some hash-tables for WebGlimpse\n");	fprintf(stderr, "-H DIR: the index is put in directory DIR\n");	fprintf(stderr, "-i: make .glimpse_include take precedence over .glimpse_exclude\n");	fprintf(stderr, "-I: output the list of files that would be indexed (but don't index)\n");	fprintf(stderr, "-M X: use X MBytes of memory for temporary tables\n");        fprintf(stderr, "-n [X]: index numbers as well as 
words; warn (into .glimpse_messages)\n\tif file adds > X%% numeric words: default is %d\n", DEF_NUMERIC_WORD_PERCENT);	fprintf(stderr, "-o: build a small (rather than tiny) size index (the recommended option!)\n");	/*fprintf(stderr, "-O: when using -r option, store byte offset of each record,\n\tinstead of the record number, for faster access\n");*/	fprintf(stderr, "-r delim: build an index at the granularity of delimiter `delim'\n\tto do booleans by reading ONLY the index\n");	fprintf(stderr, "-R: recompute .glimpse_filenames_index from .glimpse_filenames if it changes\n");	fprintf(stderr, "-s: build index to support structured (Harvest SOIF type) queries\n");	fprintf(stderr, "-S X: adjust the size of the stop list\n");	fprintf(stderr, "-t: sort the indexed files by date and time (most recent first)\n");	fprintf(stderr, "-T: build .glimpse_turbo for very fast search with -i -w in glimpse\n");	fprintf(stderr, "-U: there is extra information after filenames: works only with -F\n");	fprintf(stderr, "-w X: warn (into .glimpse_messages) if a file adds >= X words to the index\n");	fprintf(stderr, "-X: extract titles of all documents with .html, .htm, .shtm, .shtml suffix\n");	fprintf(stderr, "-z: customizable filtering using .glimpse_filters \n");	fprintf(stderr, "\n");	fprintf(stderr, "For questions about glimpse, please contact: `%s'\n", GLIMPSE_EMAIL);	exit(1);}#else	/*!BUILDCAST*/usage(flag)int	flag;{	if (flag) fprintf(stderr, "\nThis is buildcast version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);	fprintf(stderr, "usage: %s [-help] [-t] [-i] [-l] [-n [X]] [-w X] [-C] [-E] [-F] [-H DIR] [-V] NAMES\n", IProgname);        fprintf(stderr, "summary of frequently used options\n(for a more detailed listing see 'man cast'):\n");	fprintf(stderr, "-help: output this menu\n");        fprintf(stderr, "-n [X]: index numbers as well as words; warn (into .glimpse_messages)\n\tif file adds > X%% numeric words: default is %d\n", DEF_NUMERIC_WORD_PERCENT);        fprintf(stderr, 
"-w X: warn if a file adds > X words to the index\n");	fprintf(stderr, "-C: compress files with the new dictionary after building it\n");	fprintf(stderr, "-E: build cast dictionary using existing compressed files only\n");	fprintf(stderr, "-F: expect filenames on stdin (useful for pipelining)\n");        fprintf(stderr, "-H DIR: .glimpse-files should be in directory DIR: default is '~'\n");	fprintf(stderr, "\n");	fprintf(stderr, "For questions about glimpse, please contact: `%s'\n", GLIMPSE_EMAIL);	exit(1);}#endif	/*!BUILDCAST*/
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -