📄 glimpse.c
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. *//* ./glimpse/index/glimpse.c */#include "glimpse.h"#include <stdlib.h>#include <sys/time.h>#if ISO_CHAR_SET#include <locale.h> /* support for 8bit character set:ew@senate.be */#endifextern char **environ;extern int errno;extern FILE *TIMEFILE; /* file descriptor for sorting .glimpse_filenames by time */#if BG_DEBUGextern FILE *LOGFILE; /* file descriptor for LOG output */#endif /*BG_DEBUG*/extern FILE *STATFILE; /* file descriptor for statistical data about indexed files */extern FILE *MESSAGEFILE; /* file descriptor for important messages meant for the user */extern char INDEX_DIR[MAX_LINE_LEN];extern struct stat istbuf;extern char *TEMP_DIR; /* directory to store glimpse temporary files, usually /tmp unless -T is specified */#ifdef BUILDCAST/* TEMP_DIR is normally defined in ../main.c; if we're building * buildcast, that's not linked in, so we need to define one here. *//* char * TEMP_DIR = NULL; */static char * TEMP_DIR = "/tmp";#endif /* BUILDCAST */extern int indexable_char[256];extern int GenerateHash;extern int KeepFilenames;extern int OneFilePerBlock;extern int IndexNumber;extern int CountWords;extern int StructuredIndex;extern int attr_num;extern int MAXWORDSPERFILE;extern int NUMERICWORDPERCENT;extern int AddToIndex;extern int DeleteFromIndex;extern int PurgeIndex;extern int FastIndex;extern int BuildDictionary;extern int BuildDictionaryExisting;extern int CompressAfterBuild;extern int IncludeHigherPriority;extern int FilenamesOnStdin;extern int ExtractInfo;extern int InfoAfterFilename;extern int FirstWordOfInfoIsKey;extern int UseFilters;extern int ByteLevelIndex;extern int RecordLevelIndex;extern int StoreByteOffset;extern char rdelim[MAX_LINE_LEN];extern char old_rdelim[MAX_LINE_LEN];extern int rdelim_len;/* extern int IndexUnderscore; */extern int IndexableFile;extern int MAX_PER_MB, MAX_INDEX_PERCENT;extern int I_THRESHOLD;extern int BigHashTable;extern int BigFilenameHashTable;extern int IndexEverything;extern int BuildTurbo;extern int SortByTime;extern int AddedMaxWordsMessage;extern int AddedMixedWordsMessage;extern int file_num;extern int old_file_num;extern int new_file_num;extern int file_id;extern int part_num;extern char **name_list[MAXNUM_INDIRECT];extern int p_table[MAX_PARTITION];extern int *size_list[MAXNUM_INDIRECT];extern int p_size_list[];extern unsigned int *disable_list;extern int memory_usage;extern int mask_int[];extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE;extern struct indices *deletedlist;extern char sync_path[MAX_LINE_LEN];extern int ATLEASTONEFILE;extern set_usemalloc(); /* compress/misc.c */char IProgname[MAX_LINE_LEN];int ModifyFilenamesIndex = 0;/* * Has newnum crossed the boundary of an encoding? This is so rare that we * needn't optimize it by changing the format of the old index and reusing it. */cross_boundary(oldnum, newnum) int oldnum, newnum;{ int ret; if (oldnum <= 0) return 0; ret = ( ((oldnum <= MaxNum8bPartition) && (newnum > MaxNum8bPartition)) || ((oldnum <= MaxNum12bPartition) && (newnum > MaxNum12bPartition)) || ((oldnum <= MaxNum16bPartition) && (newnum > MaxNum16bPartition)) ); if (ret) fprintf(MESSAGEFILE, "Must change index format. Commencing fresh indexing...\n"); return ret;}determine_sync(){ char S[1024], s1[256], s2[256]; FILE *fp; int i, ret; strcpy(sync_path, "sync"); sprintf(S, "exec whereis sync > %s/zz.%d", TEMP_DIR,getpid()); /* Change it to use which: not urgent. */ system(S); sprintf(S, "%s/zz.%d", TEMP_DIR,getpid()); if ((fp = fopen(S, "r")) == NULL) { /* printf("11111\n"); */ return 0; } if ((ret = fread(S, 1, sizeof(S)-1, fp)) <= 0) { sprintf(S, "%s/zz.%d", TEMP_DIR,getpid()); unlink(S); fclose(fp); /* printf("22222\n"); */ return 0; } S [ret] = 0; /* terminate string */ sprintf(s1, "%s/zz.%d", TEMP_DIR,getpid()); unlink(s1); fclose(fp); /* printf("read: %s\n", S); */ sscanf(S, "%s%s", s1, s2); /* printf("s1=%s s2=%s\n", s1, s2); */ if (strncmp(s1, "sync", 4)) { /* printf("33333\n"); */ return 0; } if (!strcmp(s2, "") || !strcmp(s2, " ")) { /* printf("44444\n"); */ return 0; } if (strstr(s2, "sync") == NULL) { /* printf("55555\n"); */ return 0; } strcpy(sync_path, s2); /* printf("Using sync in: %s\n", sync_path); */ return 1;}main(argc, argv)int argc;char **argv;{ int pid = getpid(); int i, m = 0; char *indexdir, es1[MAX_LINE_LEN], es2[MAX_LINE_LEN]; char s[MAX_LINE_LEN], s1[MAX_LINE_LEN]; char working_dir[MAX_LINE_LEN]; FILE *tmpfp; char hash_file[MAX_LINE_LEN], string_file[MAX_LINE_LEN], freq_file[MAX_LINE_LEN]; char tmpbuf[1024]; struct stat stbuf; char name[MAX_LINE_LEN]; char outname[MAX_LINE_LEN]; int specialwords, threshold; int backup; struct indices *get_removed_indices(); struct timeval tv;#if ISO_CHAR_SET setlocale(LC_ALL,""); /* support for 8bit character set: ew@senate.be, Henrik.Martin@eua.ericsson.se */#endif BuildDictionary = ON; set_usemalloc(); srand(pid); umask(077); determine_sync(); INDEX_DIR[0] = '\0'; specialwords = threshold = -1; /* so that compute_dictionary can use defaults not visible here */ strncpy(IProgname, argv[0], MAX_LINE_LEN); memset(size_list, '\0', sizeof(int *) * MAXNUM_INDIRECT); /* free it once partition successfully calculates p_size_list */ memset(name_list, '\0', sizeof(char **) * MAXNUM_INDIRECT); memset(p_size_list, '\0', sizeof(int) * MAX_PARTITION); build_filename_hashtable((char *)NULL, 0); /* * Process options. */ while (argc > 1) { if (strcmp(argv[1], "-help") == 0) { return usage(1); }#if !BUILDCAST else if (strcmp(argv[1], "-R") == 0) { ModifyFilenamesIndex = 1; argc --; argv ++; } else if (strcmp(argv[1], "-V") == 0) { printf("\nThis is glimpseindex version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE); return(0); } else if (strcmp(argv[1], "-T") == 0) { BuildTurbo = ON; argc --; argv ++; } else if (strcmp(argv[1], "-I") == 0) { IndexableFile = ON; argc --; argv ++; } else if(strcmp(argv[1], "-a") == 0) { AddToIndex = ON; argc--; argv++; } else if(strcmp(argv[1], "-b") == 0) { ByteLevelIndex = ON; argc--; argv++; } else if(strcmp(argv[1], "-O") == 0) { StoreByteOffset = ON; argc--; argv++; } else if(strcmp(argv[1], "-r") == 0) { ByteLevelIndex = ON; RecordLevelIndex = ON; if (argc <= 2) { fprintf(stderr, "The -r option must be followed by a delimiter\n"); return usage(1); } else { strncpy(rdelim, argv[2], MAX_LINE_LEN); rdelim[MAX_LINE_LEN-1] = '\0'; rdelim_len = strlen(rdelim); strcpy(old_rdelim, rdelim); argc -= 2; argv += 2; } } else if(strcmp(argv[1], "-c") == 0) { CountWords = ON; argc--; argv++; } else if(strcmp(argv[1], "-d") == 0) { DeleteFromIndex = ON; argc --; argv ++; } else if(strcmp(argv[1], "-D") == 0) { PurgeIndex = OFF; argc --; argv ++; } else if(strcmp(argv[1], "-f") == 0) { FastIndex = ON; argc--; argv++; } else if (strcmp(argv[1], "-o") == 0) { OneFilePerBlock = ON; argc --; argv ++; } else if (strcmp(argv[1], "-s") == 0) { StructuredIndex = ON; argc --; argv ++; } else if(strcmp(argv[1], "-z") == 0) { UseFilters = ON; argc--; argv++; } else if(strcmp(argv[1], "-t") == 0) { SortByTime = ON; argc--; argv++; } else if (strcmp(argv[1], "-C") == 0) { BigFilenameHashTable = 1; argc --; argv ++; }#else /*!BUILDCAST*/ else if (strcmp(argv[1], "-V") == 0) { printf("\nThis is buildcast version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE); return(0); } else if(strcmp(argv[1], "-C") == 0) { CompressAfterBuild = ON; argc --; argv ++; } else if(strcmp(argv[1], "-E") == 0) { BuildDictionaryExisting = ON; argc --; argv ++; } else if (strcmp(argv[1], "-t") == 0) { if ((argc <= 2) || !(isdigit(argv[2][0]))) { return usage(1); } else { threshold = atoi(argv[2]); argc -= 2; argv += 2; } } else if (strcmp(argv[1], "-l") == 0) { if ((argc <= 2) || !(isdigit(argv[2][0]))) { return usage(1); } else { specialwords = atoi(argv[2]); argc -= 2; argv += 2; } }#endif /*!BUILDCAST*/ else if (strcmp(argv[1], "-M") == 0) { if (argc == 2) { fprintf(stderr, "-M should be followed by the amount of memory in MB for indexing words\n"); return usage(1); } m = atoi(argv[2]); if (m < 1) { fprintf(stderr, "Ignoring -M %d (< 1 MB). Using default value of about 2 MB\n", m); return usage(1); } else { /* * Calculate I_THRESHOLD approximately. Note: 2*1024*1024*2 / (2*24 + 32 + 12) = 47662, DEF_I_THRESHOLD = 40000, so OK * N * sizeofindices + N*(avgwordlen + sizeoftoken)/indicespertoken <= mem * elemsperset = occurrences/indicespertoken * N <= mem * occurrences / (sizeofindices*indicespertoken + avgwordlen + sizeoftoken) */ I_THRESHOLD = m * 1024 * 1024 * (INDICES_PER_TOKEN) / (INDICES_PER_TOKEN * sizeof(struct indices) + sizeof(struct token) + AVG_WORD_LEN); fprintf(stderr, "Using %d words as threshold before merge\n", I_THRESHOLD/INDICES_PER_TOKEN); } argc -= 2; argv += 2; } else if (strcmp(argv[1], "-w") == 0) { if (argc == 2) { fprintf(stderr, "-w should be followed by the number of words\n"); return usage(1); } MAXWORDSPERFILE = atoi(argv[2]); argc -= 2; argv += 2; } else if (strcmp(argv[1], "-S") == 0) { if (argc == 2) { fprintf(stderr, "-S should be followed by the stop list limit\n"); return usage(1); } MAX_PER_MB = MAX_INDEX_PERCENT = atoi(argv[2]); argc -= 2; argv += 2; } else if(strcmp(argv[1], "-n") == 0) { IndexNumber = ON; if ((argc <= 2) || !(isdigit(argv[2][0]))) { /* -n has no arg */ argc --; argv ++; } else { NUMERICWORDPERCENT = atoi(argv[2]); if ((NUMERICWORDPERCENT > 100) || (NUMERICWORDPERCENT < 0)) { fprintf(stderr, "The percentage of numeric words must be in [0..100]\n"); return usage(1); } argc-=2; argv+=2; } } else if(strcmp(argv[1], "-h") == 0) { /* I want to generate .glimpse_filehash and .glimpse_filehash_index */ GenerateHash = ON; argc --; argv ++; } else if(strcmp(argv[1], "-i") == 0) { IncludeHigherPriority = ON; argc --; argv ++; } else if(strcmp(argv[1], "-k") == 0) { /* I want to know what files were there before: used in SFS to compute new sets from old ones */ KeepFilenames = ON; argc --; argv ++; } else if (strcmp(argv[1], "-B") == 0) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -