📄 get_index.c
字号:
if ((j >= MAX_LINE_LEN) || (s[j] == '\0') || (s[j] == '\n')) { curroffset = ftell(indexfp); continue; } /* else it is WORD_END_MARK or ALL_INDEX_MARK */ c = s[j]; s[j] = '\0'; cmp = strcmp(word, s);#if WORD_SORTED if (cmp < 0) break; /* since index is sorted by word */ else#endif /* WORD_SORTED */ if (cmp != 0) { /* not IDENTICALLY EQUAL */ s[j] = c; curroffset = ftell(indexfp); continue; } s[j] = c; fputs(s, outfp); num++; curroffset = ftell(indexfp); } return num;}/* Returns the number of times a successful search was conducted: unused info at present. */fillup_target(result_index_set, result_offset_table, parse) unsigned int *result_index_set; struct offsets **result_offset_table; long parse;{ int i=0; FILE *tmpfp; int dummylen = 0; char dummypat[MAX_PAT]; int successes = 0, ret; int first_time = 1; extern int veryfast; int prev_INVERSE = INVERSE; veryfast = veryfastsearch(index_argc, index_argv, num_pat, pat_list, pat_lens, minifp); while (i < num_pat) { if (!veryfast) { if (is_mgrep_pat[i] && (num_mgrep_pat > 1)) { /* do later */ i++; continue; } strcpy(index_argv[patindex], pat_list[i]); /* i-th pattern in its right position */ } /* printf("pat_list[%d] = %s\n", i, pat_list[i]); */ if ((tmpfp = fopen(tempfile, "w")) == NULL) { fprintf(stderr, "%s: cannot open for writing: %s, errno=%d\n", GProgname, tempfile, errno); return(-1); } errno = 0;/* do we need to check is_mgrep_pat[i] here? */ if (veryfast && is_mgrep_pat[i]) { ret = mini_agrep(pat_list[i], pat_lens[i], tmpfp); } /* If this is the glimpse server, since the process doesn't die, most of its data pages might still remain in memory */ else if ((ret = fileagrep(index_argc, index_argv, 0, tmpfp)) < 0) { /* reinitialization here takes care of agrep_argv changes AFTER split_pattern */ fprintf(stderr, "%s: error in searching index\n", HARVEST_PREFIX); fclose(tmpfp); return(-1); } /* Now, the output of index search is in tempfile: need to use files here since index is too large */ fflush(tmpfp); fclose(tmpfp); tmpfp = NULL; /* Keep track of the maximum number of errors: will never enter veryfast */ if (GBESTMATCH) { if (errno > bestmatcherrors) bestmatcherrors = errno; } /* At this point, all index-search options are properly set due to the above fileagrep */ INVERSE = prev_INVERSE; if (-1 ==get_index(tempfile, result_index_set, result_offset_table, pat_list[i], pat_lens[i], pat_attr[i], index_argv, index_argc, nullfp, partfp, parse, first_time)) return(-1); successes ++; first_time = 0; i++; } fflush(stderr); if (veryfast) return successes; /* For index-search with memgrep in mgrep_get_index, and get-filenames */ dummypat[0] = '\0'; if ((dummylen = memagrep_init(index_argc, index_argv, MAX_PAT, dummypat)) <= 0) return(-1); if (num_mgrep_pat > 1) { CHAR *old_buf = (CHAR *)index_argv[patbufpos]; /* avoid my_free and re-my_malloc */ index_argv[patbufpos] = (char*)pat_buf; /* this contains all the patterns with the right -m and -M options */#if BG_DEBUG fprintf(debug, "pat_buf = %s\n", pat_buf);#endif /*BG_DEBUG*/ strcpy(index_argv[patindex], "-z"); /* no-op: patterns are in patbufpos; also avoid shift-left of index_argv */ if (-1 == mgrep_get_index(tempfile, result_index_set, result_offset_table, pat_list, pat_lens, pat_attr, mgrep_pat_index, num_mgrep_pat, patbufpos, index_argv, index_argc, nullfp, partfp, parse, first_time)) { index_argv[patbufpos] = (char *)old_buf; /* else will my_free array! */ fprintf(stderr, "%s: error in searching index\n", HARVEST_PREFIX); return(-1); } successes ++; first_time = 0; index_argv[patbufpos] = (char *)old_buf; } return successes;}/* * Now, I search the index by doing an in-order traversal of the boolean parse tree starting at GParse. * The results at each node are stored in src_offset_table and src_index_set. Before the right child is * evaluated, results of the left child are stored in curr_offset_table and curr_index_set (accumulators) * and are unioned/intersected/noted with the right child's results (which get stored in src_...) and * passed on above. The accumulators are allocated at each internal node and freed after evaluation. * Left to right evaluation is good since number of curr_offset_tables that exist simultaneously depends * entirely on the maximum depth of a right branch (REAL_PARTITION is small so it won't make a difference). */intsearch_index(tree) ParseTree *tree;{ int prev_INVERSE; int i, j, iii; int first_time = 0; /* since it is used AFTER left child has been computed */ unsigned int *curr_index_set = NULL; struct offsets **curr_offset_table = NULL; if (ComplexBoolean) { /* recursive */ if (tree == NULL) return -1; if (tree->type == LEAF) { /* always AND pat of individual words at each term: initialize accordingly */ if (OneFilePerBlock) { for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] = 0xffffffff; } else for(i=0; i<MAX_PARTITION; i++) src_index_set[i] = 1; src_index_set[REAL_PARTITION - 1] = 0; src_index_set[REAL_PARTITION - 2] = 0; if (split_terminal(tree->terminalindex, tree->terminalindex+1) <= 0) return -1; prev_INVERSE = INVERSE; /* agrep's global to implement NOT */ if (tree->op & NOTPAT) INVERSE = 1; if (fillup_target(src_index_set, src_offset_table, AND_EXP) <= 0) return -1; INVERSE = prev_INVERSE; return 1; } else if (tree->type == INTERNAL) { /* Search the left node and see if the right node can be searched */ if (search_index(tree->data.internal.left) <= 0) return -1; if (OneFilePerBlock && ((tree->op & OPMASK) == ORPAT) && (src_index_set[REAL_PARTITION - 1] == 1)) goto quit; /* nothing to do */ if ((tree->data.internal.right == NULL) || (tree->data.internal.right->type == 0)) return -1; /* uninitialized: see main.c */ curr_index_set = (unsigned int *)my_malloc(sizeof(int)*REAL_PARTITION); memset(curr_index_set, '\0', sizeof(int)*REAL_PARTITION); /* Save previous src_index_set and src_offset_table in fresh accumulators */ if (OneFilePerBlock) { memcpy(curr_index_set, src_index_set, sizeof(int)*REAL_PARTITION); curr_index_set[REAL_PARTITION - 1] = src_index_set[REAL_PARTITION - 1]; src_index_set[REAL_PARTITION - 1] = 0; curr_index_set[REAL_PARTITION - 2] = src_index_set[REAL_PARTITION - 2]; src_index_set[REAL_PARTITION - 2] = 0; } else memcpy(curr_index_set, src_index_set, MAX_PARTITION * sizeof(int)); if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) { if ((curr_offset_table = (struct offsets **)my_malloc(sizeof(struct offsets *) * OneFilePerBlock)) == NULL) { fprintf(stderr, "%s: malloc failure at: %s:%d\n", GProgname, __FILE__, __LINE__); my_free(curr_index_set, REAL_PARTITION*sizeof(int)); return -1; } memcpy(curr_offset_table, src_offset_table, OneFilePerBlock * sizeof(struct offsets *)); memset(src_offset_table, '\0', sizeof(struct offsets *) * OneFilePerBlock); } /* Now evaluate the right node which automatically put the results in src_index_set/src_offset_table */ if (search_index(tree->data.internal.right) <= 0) { if (curr_offset_table != NULL) free(curr_offset_table); my_free(curr_index_set, REAL_PARTITION*sizeof(int)); return -1; } /* * Alpha substitution of the code in get_index(): * index_tab <- src_index_set * dest_index_table <- curr_index_set * offset_tab <- src_offset_table * dest_offset_table <- curr_offset_table * ret <- src_index_set[REAL_PARTITION - 1] for ORPAT, curr_index_set for ANDPAT * frequency = src_index_set[REAL_PARTITION - 2] in both ORPAT and ANDPAT * first_time <- 0 * return 0 <- goto quit * Slight difference since we want the results to go to src rather than curr. */ if (OneFilePerBlock) { if ((tree->op & OPMASK) == ORPAT) { if (src_index_set[REAL_PARTITION - 1] == 1) { /* curr..[..] can never be 1 since we would have quit above itself */ ret_is_1: src_index_set[REAL_PARTITION - 1] = 1; for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)) - 1; i++) { src_index_set[i] = 0xffffffff; } src_index_set[i] = 0; for (j=0; j<8*sizeof(int); j++) { if (i*8*sizeof(int) + j >= OneFilePerBlock) break; src_index_set[i] |= mask_int[j]; } if (ByteLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH)) for (i=0; i<OneFilePerBlock; i++) { free_list(&curr_offset_table[i]); if (!RecordLevelIndex) free_list(&src_offset_table[i]); /* collect as many offsets as possible with RecordLevelIndex: free src_offset_table at the end of process_query() */ } if (ByteLevelIndex && !RecordLevelIndex) NOBYTELEVEL = 1; goto quit; } src_index_set[REAL_PARTITION - 1] = 0; for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] |= curr_index_set[i]; if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) { for (i=0; i<OneFilePerBlock; i++) { sorted_union(&src_offset_table[i], &curr_offset_table[i], &src_index_set[REAL_PARTITION - 2], curr_index_set[REAL_PARTITION - 2], 0); if (!RecordLevelIndex && NOBYTELEVEL) { /* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */ for (iii=0; iii<OneFilePerBlock; iii++) { free_list(&src_offset_table[iii]); free_list(&curr_offset_table[iii]); } break; } } } } else { if (((src_index_set[REAL_PARTITION - 1] == 1) || first_time) && (curr_index_set[REAL_PARTITION - 1] == 1)) { both_are_1: if (first_time) { src_index_set[REAL_PARTITION - 1] = 1; for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)) - 1; i++) { src_index_set[i] = 0xffffffff; } src_index_set[i] = 0; for (j=0; j<8*sizeof(int); j++) { if (i*8*sizeof(int) + j >= OneFilePerBlock) break; src_index_set[i] |= mask_int[j]; } } first_time = 0; if (ByteLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH)) for (i=0; i<OneFilePerBlock; i++) { /* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */ free_list(&curr_offset_table[i]); if (!RecordLevelIndex) free_list(&src_offset_table[i]); /* collect as many offsets as possible with RecordLevelIndex: free src_offset_table at the end of process_query() */ } if (ByteLevelIndex && !RecordLevelIndex) NOBYTELEVEL = 1; /* goto quit; */ } else if ((src_index_set[REAL_PARTITION - 1] == 1) || first_time) { first_time = 0; src_index_set[REAL_PARTITION - 1] = 0; for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] = curr_index_set[i]; if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) { for (i=0; i<OneFilePerBlock; i++) { free_list(&src_offset_table[i]); src_offset_table[i] = curr_offset_table[i]; curr_offset_table[i] = NULL; } } } else if (curr_index_set[REAL_PARTITION - 1] == 1) { if (ByteLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH)) /* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */ for (i=0; i<OneFilePerBlock; i++) free_list(&curr_offset_table[i]); } else { for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] &= curr_index_set[i]; if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) { if (first_time || WHOLEFILESCOPE) { first_time = 0; for (i=0; i<OneFilePerBlock; i++) { sorted_union(&src_offset_table[i], &curr_offset_table[i], &src_index_set[REAL_PARTITION - 2], curr_index_set[REAL_PARTITION - 2], 0); if (!RecordLevelIndex && NOBYTELEVEL) { /* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */ for (iii=0; iii<OneFilePerBlock; iii++) { free_list(&src_offset_table[iii]); free_list(&curr_offset_table[iii]); } break; } } } else { for (i=0; i<OneFilePerBlock; i++) { if ((src_index_set[block2index(i)] & mask_int[i % (8*sizeof(int))])) sorted_intersection(i, &src_offset_table[i], &curr_offset_table[i], &src_index_set[REAL_PARTITION - 2]); else free_list(&curr_offset_table[i]); /* if (src_index_set[REAL_PARTITION - 2] < MIN_OCCURRENCES) { if (!NOBYTELEVEL) { for (iii=0; iii<OneFilePerBlock; iii++) { free_list(&src_offset_table[iii]); free_list(&curr_offset_table[iii]); } } NOBYTELEVEL = 1; OPTIMIZEBYTELEVEL = 1; break; } */ } } } } } } else { if ((tree->op & OPMASK) == ORPAT) for(i=0; i<MAX_PARTITION; i++) src_index_set[i] |= curr_index_set[i]; else for(i=0; i<MAX_PARTITION; i++) src_index_set[i] &= curr_index_set[i]; } quit: if (curr_offset_table != NULL) free(curr_offset_table); /* Now if it is a NOT expression, complement the indices stuff knowing that it cannot be ByteLevelIndex */ if (tree->op & NOTPAT) { if (ByteLevelIndex) { /* Can't recover the discarded offsets */ fprintf(stderr, "%s: can't handle NOT of AND/OR terms with ByteLevelIndex: please simplify the query\n", HARVEST_PREFIX); my_free(curr_index_set, REAL_PARTITION*sizeof(int)); return -1; } if (OneFilePerBlock) for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] = ~src_index_set[i]; else for(i=0; i<MAX_PARTITION; i++) src_index_set[i] = (src_index_set[i] ? 0 : 1); } } else return -1; } else { /* iterative just as it is now: initialize */ if ((long)tree & OR_EXP) { if (OneFilePerBlock) for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] = 0; else for(i=0; i<MAX_PARTITION; i++) src_index_set[i] = 0; } else { if (OneFilePerBlock) for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] = 0xffffffff; else for(i=0; i<MAX_PARTITION; i++) src_index_set[i] = 1; } src_index_set[REAL_PARTITION - 1] = 0; src_index_set[REAL_PARTITION - 2] = 0; if (split_terminal(0, num_terminals) <= 0) return -1; prev_INVERSE = INVERSE; /* agrep's global to implement NOT */ INVERSE = 0; if (fillup_target(src_index_set, src_offset_table, (long)tree) <= 0) return -1; INVERSE = prev_INVERSE; } return 1;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -