⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 partition.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. *//* ./glimpse/index/partition.c */#include "glimpse.h"#include <sys/stat.h>#include <sys/time.h>extern int BigFilenameHashTable;extern int DeleteFromIndex;extern int FastIndex;extern int FilenamesOnStdin;extern char INDEX_DIR[MAX_LINE_LEN];extern char sync_path[MAX_LINE_LEN];extern int file_num;	/* the number of files */extern int new_file_num; /* the new number of files after purging some from index */extern char **name_list[MAXNUM_INDIRECT]; /* to store the file names */extern int  *size_list[MAXNUM_INDIRECT]; /* store size of each file */extern int  p_table[MAX_PARTITION];	/* partition table, the i-th partition begins at p_table[i] and ends at p_tables[i+1] */extern int  p_size_list[MAX_PARTITION];	/* sum of the sizes of the files in each partition */extern int  part_num;	/* number of partitions, 1 initially since partition # 0 is not accessed */extern int built_filename_hashtable;extern name_hashelement *name_hashtable[MAX_64K_HASH];extern int total_size; /* total size of the directory */extern int total_deleted; /* number of files being deleted */int  part_size=DEFAULT_PART_SIZE;	/* partition size */int  new_partition;int  files_per_partition;int  files_in_partition;int  ATLEASTONEFILE = 0;extern int errno;char patbuf[MAX_PAT];extern unsigned char *src_index_buf;extern unsigned char *dest_index_buf;extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE;extern int memory_usage;extern struct indices	*deletedlist;extern  FILE	*TIMEFILE;extern  FILE	*STATFILE;extern  FILE	*MESSAGEFILE;extern struct stat excstbuf;extern struct stat incstbuf;extern int GenerateHash;extern int KeepFilenames;extern int OneFilePerBlock;extern int ByteLevelIndex;extern int RecordLevelIndex;extern int rdelim_len;extern char rdelim[MAX_LINE_LEN];extern char old_rdelim[MAX_LINE_LEN];extern int StructuredIndex;extern int attr_num;extern char INDEX_DIR[MAX_LINE_LEN];extern int AddToIndex;extern int 
IndexableFile;extern int BuildTurbo;extern int SortByTime;char *exin_argv[8];int exin_argc;char current_dir_buf[2*MAX_LINE_LEN + 4];	/* must have space to store pattern after directory name */unsigned char dummypat[MAX_PAT];int dummylen;FILE *dummyout;partition(dir_num, dir_name)char **dir_name;int  dir_num;{    int num_pat=0;    int num_inc=0;    int len;    long thetime;    long prevtime;    int theindex;    int firsttime = 1;    int xx;    struct timeval tv;    FILE *tmp_TIMEFILE;    FILE *index_TIMEFILE;    int ret;    char **temp_name_list;    int *temp_size_list;    int temp_file_num;    char S[MAX_LINE_LEN], S1[MAX_LINE_LEN], es1[MAX_LINE_LEN], es2[MAX_LINE_LEN], es3[MAX_LINE_LEN];    int  pat_len[MAX_EXCLUSIVE];    int  inc_len[MAX_EXCLUSIVE];    CHAR *inc[MAX_INCLUSIVE];	/* store the patterns used to mask in files */    CHAR *pat[MAX_EXCLUSIVE];     /* store the patterns that are used to				     mask out those files that are not to				     be indexed  */    int MinPartNum; 		/* minimum number of partitions */    int i=0, j;    int subtotal=0;    int pdx = 0; 			/* index pointer for p_table */    FILE *patfile; 	/* file descriptor for prohibit pattern file */    FILE *incfile;	/* file descriptor for include pattern file */    char *current_dir;	/* must have '\n' before directory name */    char s[MAX_LINE_LEN];    char working_dir[MAX_LINE_LEN];    struct stat sbuf;    current_dir_buf[0] = '\n';    current_dir_buf[1] = '\0';    current_dir = &current_dir_buf[1];    /* if (IndexableFile) goto directlytofsize; */    if ((dummyout = fopen("/dev/null", "w")) == NULL) return -1;    exin_argv[0] = "glimpseindex";    exin_argv[1] = "dummypat";    exin_argc = 2;    if ((dummylen = memagrep_init(exin_argc, exin_argv, MAX_PAT, dummypat)) <= 0) return -1;	/* exclude/include pattern search */    sprintf(s, "%s/%s", INDEX_DIR, PROHIBIT_LIST);    patfile = fopen(s, "r");    if(patfile == NULL) {	/* fprintf(stderr, "can't open exclude-pattern file\n"); -- no need! 
*/	num_pat = 0;    }    else {	while((num_pat < MAX_EXCLUSIVE) && fgets(patbuf, MAX_PAT, patfile)) {		if ((len = strlen(patbuf)) < 1) continue;		patbuf[len-1] = '\0';		if ((pat_len[num_pat] = convert2agrepregexp(patbuf, len-1)) == 0) continue;		pat[num_pat++] = (unsigned char *) strdup(patbuf);	}	fclose(patfile);    }#if	0    printf("num_pat %d\n", num_pat);    for(i=0; i<num_pat; i++) printf("len=%d pat=%s\n", pat_len[i], pat[i]);    printf("memagrep=%d\n", memagrep_search(-pat_len[0], pat[0], 17, "\n.glimpse_index\nasdfk", 0, stdout));#endif    sprintf(s, "%s/%s", INDEX_DIR, INCLUDE_LIST);    incfile = fopen(s, "r");    if(incfile == NULL) {	/* fprintf(stderr, "can't open include-pattern file\n"); -- no need! */	num_inc = 0;    }    else {	while((num_inc < MAX_INCLUSIVE) && fgets(patbuf, MAX_PAT, incfile)) {		if ((len = strlen(patbuf)) < 1) continue;		patbuf[len-1] = '\0';		if ((inc_len[num_inc] = convert2agrepregexp(patbuf, len-1)) == 0) continue;		inc[num_inc++] = (unsigned char *) strdup(patbuf);	}	fclose(incfile);    }#if	0    printf("num_inc %d\n", num_inc);    for(i=0; i<num_inc; i++) printf("len=%d inc=%s\n", inc_len[i], inc[i]);#endif#ifdef	SW_DEBUG    printf("dir_num = %d", dir_num-1);#endifdirectlytofsize:    if ((dir_num <= 1) && (FilenamesOnStdin)) while (fgets(current_dir, MAX_LINE_LEN, stdin) == current_dir) {	current_dir[strlen(current_dir)-1] = '\0';	/* overwrite \n with \0 */	/* Get absolute path name of the directory or file being indexed */	if (-1 == my_stat(current_dir, &sbuf)) {		fprintf(stderr, "permission denied or non-existent: %s\n", current_dir);		continue;	}	if ((S_ISDIR(sbuf.st_mode)) && (current_dir[0] != '/')) {	    getcwd(working_dir, MAX_LINE_LEN - 1);	    if (-1 == chdir(current_dir)) {		fprintf(stderr, "Cannot chdir to %s\n", current_dir);		continue;	    }	    getcwd(current_dir, MAX_LINE_LEN - 1);	    chdir(working_dir);	}	if (!IndexableFile) printf("Indexing \"%s\" ...\n", current_dir);    	fsize(current_dir, pat, pat_len, 
num_pat, inc, inc_len, num_inc, 0); /* the file names will be in name_list[]: NOT TOP LEVEL!!! Mar/11/96 */    }    else for(i=1; i<dir_num; i++) {    	strcpy(current_dir, dir_name[i]);	/* Get absolute path name of the directory or file being indexed */	if (-1 == my_stat(current_dir, &sbuf)) {		fprintf(stderr, "permission denied or non-existent: %s\n", current_dir);		continue;	}	if ((S_ISDIR(sbuf.st_mode)) && (current_dir[0] != '/')) {	    getcwd(working_dir, MAX_LINE_LEN - 1);	    if (-1 == chdir(current_dir)) {		fprintf(stderr, "Cannot chdir to %s\n", current_dir);		continue;	    }	    getcwd(current_dir, MAX_LINE_LEN - 1);	    chdir(working_dir);	}	if (!IndexableFile) {		if (!DeleteFromIndex) printf("Indexing \"%s\" ...\n", current_dir);	}    	fsize(current_dir, pat, pat_len, num_pat, inc, inc_len, num_inc, 1); /* the file names will be in name_list[] */    }    if (IndexableFile) return 0;    /*     * If -t option is set, we must sort .glimpse_filenames (i.e., name_list and size_list) according to the most recent modification date.     * The file-number VS its modification date are already available in .glimpse_filetimes which is created in main() and filled in fsize().     
*/    if (SortByTime && (file_num > 0)) {	fflush(TIMEFILE);	fclose(TIMEFILE);#if	USESORT_Z_OPTION#if	DONTUSESORT_T_OPTION || SFS_COMPAT	sprintf(S, "exec %s -n -r -z %d '%s/%s' > '%s/%s.tmp'\n", SYSTEM_SORT, maxsortlinelen, escapesinglequote(INDEX_DIR, es1), DEF_TIME_FILE, escapesinglequote(INDEX_DIR, es2), DEF_TIME_FILE);#else	sprintf(S, "exec %s -n -r -T '%s' -z %d '%s/%s' > '%s/%s.tmp'\n", SYSTEM_SORT, escapesinglequote(INDEX_DIR, es1), maxsortlinelen, escapesinglequote(INDEX_DIR, es2), DEF_TIME_FILE, escapesinglequote(INDEX_DIR, es3), DEF_TIME_FILE);#endif#else#if	DONTUSESORT_T_OPTION || SFS_COMPAT	sprintf(S, "exec %s -n -r '%s/%s' > '%s/%s.tmp'\n", SYSTEM_SORT, escapesinglequote(INDEX_DIR, es1), DEF_TIME_FILE, escapesinglequote(INDEX_DIR, es2), DEF_TIME_FILE);#else	sprintf(S, "exec %s -n -r -T '%s' '%s/%s' > '%s/%s.tmp'\n", SYSTEM_SORT, escapesinglequote(INDEX_DIR, es1), escapesinglequote(INDEX_DIR, es2), DEF_TIME_FILE, escapesinglequote(INDEX_DIR, es3), DEF_TIME_FILE);#endif#endif#ifdef BG_DEBUG	printf("%s", S);#endif	if((ret=system(S)) != 0) {	    sprintf(S1, "system('%s') failed at:\n\t File=%s, Line=%d, Errno=%d", S, __FILE__, __LINE__, errno);	    perror(S1);	    fprintf(stderr, "Please try to run the program again\n(If there's no memory, increase the swap area / don't use -M and -B options)\n");	    sprintf(S, "%s/%s", INDEX_DIR, DEF_TIME_FILE);	    unlink(S);	    exit(2);	}	sprintf(S, "%s/%s.tmp", INDEX_DIR, DEF_TIME_FILE);	if ((tmp_TIMEFILE = fopen(S, "r")) == NULL) {	    fprintf(stderr, "can't open %s for reading\n", S);	    unlink(S);	    exit(2);	}	sprintf(S, "%s/%s", INDEX_DIR, DEF_TIME_FILE);	if ((TIMEFILE = fopen(S, "w")) == NULL) {	    fprintf(stderr, "can't open %s for writing\n", S);	    unlink(S);	    exit(2);	}	sprintf(S, "%s/%s.index", INDEX_DIR, DEF_TIME_FILE);	if ((index_TIMEFILE = fopen(S, "w")) == NULL) {	    fprintf(stderr, "can't open %s for writing\n", S);	    unlink(S);	    exit(2);	}	/* Get the sorted times from .glimpse_times.tmp; 
dump the exact times for each file in .glimpse_times; dump per-day file# in .glimpse_times.index */	gettimeofday(&tv, NULL);	temp_file_num = 0;	temp_name_list = (char **)my_malloc(sizeof(char *) * file_num);	memset(temp_name_list, '\0', sizeof(char *) * file_num);	temp_size_list = (int *)my_malloc(sizeof(int) * file_num);	memset(temp_size_list, '\0', sizeof(int) * file_num);	prevtime = tv.tv_sec;	while (fscanf(tmp_TIMEFILE, "%ld %d", &thetime, &theindex) == 2) {	    temp_name_list[temp_file_num] = LIST_GET(name_list, theindex);	    temp_size_list[temp_file_num] = LIST_GET(size_list, theindex);	    for (xx=0; xx<sizeof(long); xx++) fputc((thetime & ((0xff) << (8*(sizeof(long) - xx - 1))))>>(8*(sizeof(long) - xx - 1)), TIMEFILE);	    /* fprintf(TIMEFILE, "%d %d\n", thetime, (prevtime - thetime)/86400); */	    if (firsttime) {		for (i=0; i<(prevtime - thetime + 86399)/86400; i++) {		    for (xx=0; xx<sizeof(int); xx++)			fputc((temp_file_num & ((0xff) << (8*(sizeof(int) - xx - 1))))>>(8*(sizeof(int) - xx - 1)), index_TIMEFILE);		    /* fprintf(index_TIMEFILE, "%d\n", temp_file_num); */		}	    }	    else {		for (i=0; i<(prevtime - thetime)/86400; i++) {		    for (xx=0; xx<sizeof(int); xx++)			fputc((temp_file_num & ((0xff) << (8*(sizeof(int) - xx - 1))))>>(8*(sizeof(int) - xx - 1)), index_TIMEFILE);		    /* fprintf(index_TIMEFILE, "%d\n", temp_file_num); */		}	    }	    temp_file_num ++;	    if (!firsttime) prevtime -= i*86400;	    else if (i>0) prevtime -= (i-1)*86400;	    firsttime = 0;	}	if (temp_file_num != file_num) {	    fprintf(stderr, "error in sort: File=%s, Line=%d\n", __FILE__, __LINE__);	    exit(2);	}	/* Change the lists to be sorted now; free temporary lists */	for (i=0; i<temp_file_num; i++) {	    LIST_SUREGET(name_list, i) = temp_name_list[i];	    LIST_SUREGET(size_list, i) = temp_size_list[i];	}	my_free(temp_name_list, sizeof(char *) * file_num);	my_free(temp_size_list, sizeof(int) * file_num);	fclose(tmp_TIMEFILE);	fflush(TIMEFILE);	fclose(TIMEFILE);	
fflush(index_TIMEFILE);	fclose(index_TIMEFILE);	sprintf(S, "%s/%s.tmp", INDEX_DIR, DEF_TIME_FILE);	unlink(S);    }    for(i=0; i<file_num; i++) total_size += LIST_GET(size_list, i);    for(i=0; i<file_num; i++) if (LIST_GET(name_list, i) == NULL) total_deleted ++;    if (DeleteFromIndex) {	if (total_size <= 0) {	    fprintf(STATFILE, "#of files being deleted = %d, Total #of files = %d\n", total_deleted, file_num - total_deleted);	    printf("\n#of files being deleted = %d, Total #of files = %d\n", total_deleted, file_num - total_deleted);	/* the only output the user sees */	}	else {	    fprintf(STATFILE, "Size of files being indexed = %d B, #of files being deleted = %d, Total #of files = %d\n", total_size, total_deleted, file_num - total_deleted);	    printf("\nSize of files being indexed = %d B, #of files being deleted = %d, Total #of files = %d\n", total_size, total_deleted, file_num - total_deleted);	/* the only output the user sees */	}    }    else {	fprintf(STATFILE, "Size of files being indexed = %d B, Total #of files = %d\n", total_size, file_num);	printf("\nSize of files being indexed = %d B, Total #of files = %d\n", total_size, file_num);	/* the only output the user sees */    }#ifdef	SW_DEBUG    for (i=0; i<file_num; i++)	printf("name_list[%d] = %s, size=%d\n", i, LIST_GET(name_list, i), LIST_GET(size_list, i));#endif	/*SW_DEBUG*/    for (i=0; i<num_inc; i++) {#if	BG_DEBUG	memory_usage -= strlen(inc) + 2;#endif	/*BG_DEBUG*/	my_free(inc[i], 0);    }    for (i=0; i<num_pat; i++) {#if	BG_DEBUG	memory_usage -= strlen(pat) + 2;#endif	/*BG_DEBUG*/	my_free(pat[i], 0);    }    /* Life (algorithm) is much simpler, but encode/decode (I/O) is more complex: the p_table is irrelevant */    if (OneFilePerBlock)	return 0;    /* Now put the files into partitions */    i=0;    part_size = total_size / MaxNumPartition;    if (part_size <= 0) part_size = total_size;    LIST_ADD(size_list, file_num, part_size, int);    if (file_num / 2 <= 1) {	p_table[0] = 0;	p_table[1] = 0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -