📄 build_in.c
字号:
printf("src_index_buf = %s", src_index_buf); printf("dest_index_buf = %s", dest_index_buf);#endif if (!CountWords && !ByteLevelIndex) { /* have to look for common indices and exclude them */ int oldbdx1, oldbdx2; merge_index_buf[0] = '\0'; merge_len = 0; oldbdx1 = bdx1 = 0; /* src_index_buf[src_end_pt] is '\0', src_index_buf[src_end_pt-1] is '\n' */ while((bdx1 < src_end_pt) && (src_index_buf[bdx1] != WORD_END_MARK) && (src_index_buf[bdx1] != ALL_INDEX_MARK)) bdx1 ++; if ((bdx1 > oldbdx1) && (bdx1 < src_end_pt)) { /* src_index_buf[bdx1] is the word-end-mark */ src_mark = src_index_buf[bdx1]; src_index_buf[bdx1] = '\0'; /* terminate word */ strcpy(merge_index_buf, src_index_buf); /* save the word itself */ merge_len = strlen(src_index_buf); /* merge_index_buf[merge_len] is '\0', merge_index_buf[merge_len-1] is a part of the word */ bdx1 ++; /* skip word end marker */ if (StructuredIndex) bdx1 += 2; /* skip attribute field */ } even_words = 1; src_num = 0; if (OneFilePerBlock) memset((char *)src_index_set, '\0', sizeof(int)*REAL_PARTITION); else memset((char *)src_index_set, '\0', sizeof(int) * (MAX_PARTITION + 1)); while(bdx1 < src_end_pt - 1) { if (OneFilePerBlock) { x = 0; if (file_num <= MaxNum8bPartition) { x = decode8b(src_index_buf[bdx1]); bdx1 ++; } else if (file_num <= MaxNum12bPartition) { if (even_words) { x = ((src_index_buf[bdx1+1] & 0x0000000f) << 8) | src_index_buf[bdx1]; x = decode12b(x); bdx1 += 2; even_words = 0; } else { /* odd number of words so far */ x = ((src_index_buf[bdx1-1] & 0x000000f0) << 4) | src_index_buf[bdx1]; x = decode12b(x); bdx1 ++; even_words = 1; } } else if (file_num <= MaxNum16bPartition) { x = (src_index_buf[bdx1] << 8) | src_index_buf[bdx1+1]; x = decode16b(x); bdx1 += 2; } else { x = (src_index_buf[bdx1] << 16) | (src_index_buf[bdx1+1] << 8) | src_index_buf[bdx1+2]; x = decode24b(x); bdx1 += 3; } src_index_set[block2index(x)] |= mask_int[x % (8*sizeof(int))]; src_num ++; } else src_index_set[src_num++] = 
src_index_buf[bdx1++]; } oldbdx2 = bdx2 = 0; /* dest_index_buf[dest_end_pt] is '\0', dest_index_buf[dest_end_pt-1] is '\n' */ while((bdx2 < dest_end_pt) && (dest_index_buf[bdx2] != WORD_END_MARK) && (dest_index_buf[bdx2] != ALL_INDEX_MARK)) bdx2 ++; if ((bdx2 > oldbdx2) && (bdx2 < dest_end_pt)) { /* dest_index_buf[bdx2] is the word-end-mark */ dest_mark = dest_index_buf[bdx2]; dest_index_buf[bdx2] = '\0'; /* terminate word */ if (merge_len == 0) { strcpy(merge_index_buf, dest_index_buf); /* save the word itself */ merge_len = strlen(merge_index_buf); /* merge_index_buf[merge_len] is '\0', merge_index_buf[merge_len-1] is a part of the word */ } bdx2 ++; /* skip word end marker */ if (StructuredIndex) bdx2 += 2; /* skip attribute field */ } even_words = 1; dest_num = 0; if (OneFilePerBlock) memset((char *)dest_index_set, '\0', sizeof(int)*REAL_PARTITION); else memset((char *)dest_index_set, '\0', sizeof(int) * (MAX_PARTITION + 1)); while(bdx2 < dest_end_pt - 1) { if (OneFilePerBlock) { x = 0; if (file_num <= MaxNum8bPartition) { x = decode8b(dest_index_buf[bdx2]); bdx2 ++; } else if (file_num <= MaxNum12bPartition) { if (even_words) { x = ((dest_index_buf[bdx2+1] & 0x0000000f) << 8) | dest_index_buf[bdx2]; x = decode12b(x); bdx2 += 2; even_words = 0; } else { /* odd number of words so far */ x = ((dest_index_buf[bdx2-1] & 0x000000f0) << 4) | dest_index_buf[bdx2]; x = decode12b(x); bdx2 ++; even_words = 1; } } else if (file_num <= MaxNum16bPartition) { x = (dest_index_buf[bdx2] << 8) | dest_index_buf[bdx2+1]; x = decode16b(x); bdx2 += 2; } else { x = (dest_index_buf[bdx2] << 16) | (dest_index_buf[bdx2+1] << 8) | dest_index_buf[bdx2+2]; x = decode24b(x); bdx2 += 3; } dest_index_set[block2index(x)] |= mask_int[x % (8*sizeof(int))]; dest_num ++; } else dest_index_set[dest_num++] = dest_index_buf[bdx2++]; } even_words = 1; if (merge_len > 0) { if(OneFilePerBlock && ((src_mark == ALL_INDEX_MARK) || (dest_mark == ALL_INDEX_MARK) || ((file_num > MaxNum8bPartition) && 
(src_num + dest_num > file_num*MAX_INDEX_PERCENT / 100)) )) { merge_index_buf[merge_len++] = ALL_INDEX_MARK; if (StructuredIndex) { merge_index_buf[merge_len++] = (attr1 & 0xff00) >> 8; merge_index_buf[merge_len++] = (attr1 & 0xff); } if (file_num <= MaxNum8bPartition) merge_index_buf[merge_len ++] = encode8b(DONT_CONFUSE_SORT); else if (file_num <= MaxNum12bPartition) { merge_index_buf[merge_len ++] = (encode12b(DONT_CONFUSE_SORT) & 0x00000f00) >> 8; merge_index_buf[merge_len ++] = encode12b(DONT_CONFUSE_SORT) & 0x000000ff; } else if (file_num <= MaxNum16bPartition) { merge_index_buf[merge_len ++] = (encode16b(DONT_CONFUSE_SORT) & 0x0000ff00) >> 8; merge_index_buf[merge_len ++] = encode16b(DONT_CONFUSE_SORT) & 0x000000ff; } else { merge_index_buf[merge_len ++] = (encode24b(DONT_CONFUSE_SORT) & 0x00ff0000) >> 16; merge_index_buf[merge_len ++] = (encode24b(DONT_CONFUSE_SORT) & 0x0000ff00) >> 8; merge_index_buf[merge_len ++] = encode24b(DONT_CONFUSE_SORT) & 0x000000ff; } goto final_merge; } merge_index_buf[merge_len++] = WORD_END_MARK; if (StructuredIndex) { merge_index_buf[merge_len++] = (attr1 & 0xff00) >> 8; merge_index_buf[merge_len++] = (attr1 & 0xff); } if (OneFilePerBlock) { for (i=0; i<round(file_num, 8*sizeof(int)); i++) dest_index_set[i] |= src_index_set[i]; /* take union */ for (i=0; i<round(file_num, 8*sizeof(int)); i++) if (dest_index_set[i]) for (j=0; j<8*sizeof(int); j++) if (dest_index_set[i] & mask_int[j]) { x = i*8*sizeof(int) + j; if (file_num <= MaxNum8bPartition) { merge_index_buf[merge_len++] = encode8b(x); } else if (file_num <= MaxNum12bPartition) { x = encode12b(x); if (even_words) { merge_index_buf[merge_len++] = x & 0x000000ff; /* lsb */ y = (x & 0x00000f00)>>8; /* msb */ even_words = 0; } else { /* odd number of words so far */ y |= (x&0x00000f00)>>4; /* msb of x into msb of y */ merge_index_buf[merge_len ++] = y; merge_index_buf[merge_len ++] = x&0x000000ff; even_words = 1; } } else if (file_num <= MaxNum16bPartition) { x = encode16b(x); 
merge_index_buf[merge_len ++] = (x&0x0000ff00)>>8; merge_index_buf[merge_len ++] = x&0x000000ff; } else { x = encode24b(x); merge_index_buf[merge_len ++] = (x&0x00ff0000)>>16; merge_index_buf[merge_len ++] = (x&0x0000ff00)>>8; merge_index_buf[merge_len ++] = x&0x000000ff; } } if (!even_words && (file_num > MaxNum8bPartition) && (file_num <= MaxNum12bPartition)) merge_index_buf[merge_len ++] = y; } else { /* normal indexing */ for (i=0; i<src_num; i++) { merge_index_buf[merge_len++] = src_index_set[i]; } for (j=0; j<dest_num; j++) { for (i=0; i<src_num; i++) if (dest_index_set[j] == src_index_set[i]) break; if (i>=src_num) /* did not find match */ merge_index_buf[merge_len++] = dest_index_set[j]; /* Doesn't matter if dest_index_set is int-array (merge_index_buf being char array) since dest_index_set has only a char */ } } final_merge: merge_index_buf[merge_len++] = '\n'; merge_index_buf[merge_len] = '\0'; fputs(merge_index_buf, f3); /* fprintf(stderr, "%d+%d=%d ", src_end_pt, dest_end_pt, merge_len); */ } /* merge_len > 0 */ } else if (CountWords) { /* indices are frequencies, so just merge them: OneFilPerBlock is ignored */ strcpy(merge_index_buf, src_index_buf); bdx = strlen(merge_index_buf); /* merge_index_buf[bdx] is '\0', merge_index_buf[bdx-1] is '\n' */ if (bdx > 1) bdx--; /* now merge_index_buf[bdx] is '\n', merge_index_buf[bdx-1] is the last index */ bdx2 = 0; /* find the first index */ if (IndexNumber) while(isalnum(dest_index_buf[bdx2])) bdx2 ++; else while(isalpha(dest_index_buf[bdx2])) bdx2++; /* to skip over the word-end marker of dest_index_buf (which is a blank) */ if (bdx2 > 0) bdx2 ++; if (StructuredIndex) bdx2 += 2; /* this is a nop since CountWords and StructuredIndex don't make sense together */ if (bdx >= 1) { merge_index_buf[bdx++] = ' '; /* blank separated fscanf-able list of integers representing counts */ } /* append the indices of word1 to the buffer */ if (dest_index_buf[bdx2] > 0) { while(dest_index_buf[bdx2]>0) merge_index_buf[bdx++] = 
dest_index_buf[bdx2++]; /* '\n' gets copied */ merge_index_buf[bdx] = '\0'; } /* else, no need to copy anything */ fputs(merge_index_buf, f3); } else { /* indices are actual occurrences (ByteLevelIndex), so just cat them one after the other, src first since that is i2, the 1st one */ /* First put out the word */ bdx1 = 0; while ((bdx1<src_end_pt) && (src_index_buf[bdx1] != WORD_END_MARK) && (src_index_buf[bdx1] != ALL_INDEX_MARK) && (src_index_buf[bdx1] != '\n') && (src_index_buf[bdx1] != '\0')) putc(src_index_buf[bdx1 ++], f3); /* Now check what end-mark we should put */ if ((bdx1 >= src_end_pt) || (src_index_buf[bdx1] == ALL_INDEX_MARK) || (src_end_pt + dest_end_pt >= MAX_SORTLINE_LEN)) { putc(ALL_INDEX_MARK, f3); if (StructuredIndex) { putc((attr1&0xff00) >> 8, f3); putc((attr1&0xff), f3); } putc(DONT_CONFUSE_SORT, f3); putc('\n', f3); } else { /* dest can be all index mark */ bdx2 = 0; while ((bdx2<dest_end_pt) && (dest_index_buf[bdx2] != WORD_END_MARK) && (dest_index_buf[bdx2] != ALL_INDEX_MARK) && (dest_index_buf[bdx2] != '\n') && (dest_index_buf[bdx2] != '\0')) bdx2 ++; if ((bdx2 >= dest_end_pt) || (dest_index_buf[bdx2] == ALL_INDEX_MARK)) { putc(ALL_INDEX_MARK, f3); if (StructuredIndex) { putc((attr1&0xff00) >> 8, f3); putc((attr1&0xff), f3); } putc(DONT_CONFUSE_SORT, f3); putc('\n', f3); } else { /* we have to put out both the lists */ putc(WORD_END_MARK, f3); bdx1 ++; /* skip over WORD_END_MARK */ if (StructuredIndex) { putc((attr1&0xff00) >> 8, f3); putc((attr1&0xff), f3); bdx1 += 2; } while ((bdx1 < src_end_pt) && (src_index_buf[bdx1] != '\n') && (src_index_buf[bdx1] != '\0')) putc(src_index_buf[bdx1++], f3); fputc(encode8b(0), f3); /* instead of the '\n' after end of src_index_buf */ bdx2 ++; /* skip over WORD_END_MARK */ if (StructuredIndex) bdx2 += 2; while ((bdx2 < dest_end_pt) && (dest_index_buf[bdx2] != '\n') && (dest_index_buf[bdx2] != '\0')) putc(dest_index_buf[bdx2++], f3); putc('\n', f3); } } }#if debug printf("merge_index_buf = %s", 
merge_index_buf);
#endif /*debug*/
            /* Clear the first dest_end_pt+2 bytes of dest_index_buf before it is refilled. */
            memset(dest_index_buf, '\0', dest_end_pt+2);
            /*
             * Fetch the next line of f2. When fgets returns 0 (nothing more could be
             * read) TAIL1 is switched ON and control breaks out of the enclosing loop;
             * the code after that loop tests TAIL1 to decide which input still has
             * lines left to copy.
             */
            if(fgets(dest_index_buf, REAL_INDEX_BUF, f2) == 0) {
                dest_index_buf[REAL_INDEX_BUF - 1] = '\0';
                TAIL1 = ON;
                break;
            }
            /* Force a terminator in the last byte, then measure the line and re-scan its word into word2/attr2. */
            dest_index_buf[REAL_INDEX_BUF - 1] = '\0';
            dest_end_pt = strlen(dest_index_buf);
            scanword(word2, dest_index_buf, dest_index_buf+dest_end_pt, &attr2);
        }
        else { /* word1 < word2, so output src_index_buf */
            fputs(src_index_buf, f3);
        }
        /* Clear the first src_end_pt+2 bytes of src_index_buf now that this src line has been handled. */
        memset(src_index_buf, '\0', src_end_pt+2);
    }
    if(TAIL1) {
        /*
         * TAIL1 set: copy what remains of f1 to f3, line by line.
         * The line still sitting in src_index_buf is written first unless cmp == 0.
         * NOTE(review): presumably cmp == 0 means that line was already merged into
         * the output above -- confirm where cmp is assigned.
         */
        if(cmp != 0) fputs(src_index_buf, f3);
        memset(src_index_buf, '\0', src_end_pt+2);
        while(fgets(src_index_buf, REAL_INDEX_BUF, f1)) {
            src_index_buf[REAL_INDEX_BUF - 1] = '\0';
            src_end_pt = strlen(src_index_buf);
            fputs(src_index_buf, f3);
            /* Clear exactly the bytes just used so the next fgets starts from a clean buffer. */
            memset(src_index_buf, '\0', src_end_pt+2);
        }
    }
    else { /* output the tail of f2 */
        /* The line currently held in dest_index_buf goes out first, then every remaining line of f2. */
        fputs(dest_index_buf, f3);
        memset(dest_index_buf, '\0', dest_end_pt+2);
        while(fgets(dest_index_buf, REAL_INDEX_BUF, f2)) {
            dest_index_buf[REAL_INDEX_BUF - 1] = '\0';
            dest_end_pt = strlen(dest_index_buf);
            fputs(dest_index_buf, f3);
            memset(dest_index_buf, '\0', dest_end_pt+2);
        }
    }
    return;
}

/*
 * remove_filename: drop the name stored for fileindex from name_list.
 *
 * fileindex     - position in name_list; the call is a no-op when it is
 *                 negative or >= MaxNum24bPartition.
 * new_partition - not referenced anywhere in the body.
 *
 * The entry fetched with LIST_GET is passed to my_free and the slot is then
 * set to NULL through LIST_SUREGET. When disable_list is non-NULL and
 * fileindex < old_file_num, mask_int[fileindex % (8*sizeof(int))] is OR-ed
 * into disable_list[block2index(fileindex)] (presumably a one-bit-per-file
 * bitmap -- confirm against the definitions of mask_int and block2index).
 *
 * Old-style definition: no return type is declared and no value is returned.
 */
remove_filename(fileindex, new_partition)
    int fileindex, new_partition;
{
    if ((fileindex < 0) || (fileindex >= MaxNum24bPartition)) return;
#if BG_DEBUG
    /*
     * Debug builds only: log the name and subtract strlen(name) + 2 from memory_usage.
     * NOTE(review): the entry is used here without a NULL check, while other code in
     * this file treats a NULL entry as "deleted" -- verify this cannot be reached for
     * an index whose slot is already NULL.
     */
    fprintf(LOGFILE, "removing %s from index\n", LIST_GET(name_list, fileindex));
    memory_usage -= (strlen(LIST_GET(name_list, fileindex)) + 2);
#endif /*BG_DEBUG*/
    my_free(LIST_GET(name_list, fileindex), 0);
    LIST_SUREGET(name_list, fileindex) = NULL;
    if ((disable_list != NULL) && (fileindex < old_file_num)) disable_list[block2index(fileindex)] |= mask_int[fileindex % (8*sizeof(int))];
}

/* returns the set of deleted files in the struct indices format: note that by construction, the list is sorted according to fileindex */
/*
 * Walks i = 0 .. file_num-1 and records every i whose name_list entry is NULL
 * or an empty string. The result is a chain of struct indices nodes linked
 * through next_i; each node is obtained from my_malloc, zeroed, and has all
 * INDEX_SET_SIZE slots preset to INDEX_ELEM_FREE, so unused slots in the last
 * node keep that value. Returns the first node, or NULL when nothing qualified.
 * Nothing is freed here.
 */
struct indices *
get_removed_indices()
{
    int i, j;
    char *name;
    struct indices *head = NULL, **tail, *new;

    /* tail always addresses the pointer that refers to the last node (head itself while the list is empty). */
    tail = &head;
    for (i=0; i<file_num; i++) {
        name=LIST_GET(name_list, i);
#if 0
        if ((name == NULL) || (name[0] == '\0')) printf("DEL %d\n", i);
#endif
        if ((name == NULL) || (name[0] == '\0')) {
            /* A fresh node is needed when there is no node yet or the last slot of the current one is taken. */
            if ((*tail == NULL) || ((*tail)->index[INDEX_SET_SIZE - 1] != INDEX_ELEM_FREE)) {
                new = (struct indices *)my_malloc(sizeof(struct indices));
                memset(new, '\0', sizeof(struct indices));
                for (j=0; j<INDEX_SET_SIZE; j++) new->index[j] = INDEX_ELEM_FREE;
                if (*tail != NULL) {
                    /* Append after the current last node, then advance tail to that node's next_i field. */
                    (*tail)->next_i = new;
                    tail = &(*tail)->next_i;
                }
                else head = new;
            }
            /* Locate the first free slot in the last node and store i there. */
            for (j=0; j<INDEX_SET_SIZE; j++)
                if ((*tail)->index[j] == INDEX_ELEM_FREE) break;
            /* j must be < INDEX_SET_SIZE */
            (*tail)->index[j] = i;
        }
    }
    return head;
}

/* returns a -ve number if there is no newfileindex for this file (deleted from index), or the new index otherwise */
/* length_of_deletedlist = MaxNum24bPartition + 1 - get_new_index(deletedlist, MaxNum24bPartition + 1); */
get_new_index(deletedlist, oldfileindex)
    struct indices *deletedlist;
    int oldfileindex;
{
    int j;
    int reduction = 0;
    struct indices *head = deletedlist;

    while (head!=NULL) {
        for (j=0; j<INDEX_SET_SIZE; j++) {
            if (head->index[j] == INDEX_ELEM_FREE) return oldfileindex; /* crossed the limit */
            else if (oldfileindex == head->index[j]) return -1; /* oldfileindex has been deleted now */
            else if (oldfileindex > head->index[j]) oldf
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -