📄 catfun.c
字号:
"warning: empirical hash full, calculation may be skewed. " "Try option -h %d\n", (empirical.max_hash_bits + 1)); hashfull_warning = 1; } return; /* pretend word doesn't exist */ } if( empirical.track_features ) { if( empirical.feature_stack_top < MAX_TOKEN_LINE_STACK ) { empirical.feature_stack[empirical.feature_stack_top++] = h; } else { empirical.track_features = 0; empirical.feature_stack_top = 0; } } } empirical.full_token_count += ( empirical.full_token_count < K_TOKEN_COUNT_MAX ) ? 1 : 0; } } /* now do scoring for all available categories */ for(i = 0; i < cat_count; i++) { switch(cat[i].model_type) { case simple: /* see if this is for us */ if( ((re == 0) && (r == 1)) || ((re > 0) && (cat[i].retype & (1<<(re-1)))) ) { /* if token found, add its lambda weight */ k = find_in_category(&cat[i], id); if( k ) { cat[i].score += UNPACK_LAMBDA(k->lam); } /* now compute the reference weight from digram model */ pp = (unsigned char)*tok; q = tok + 1; while( *q ) { if( *q == '\r' ) { q++; continue; } pc = (unsigned char)*q; cat[i].score += UNPACK_DIGRAMS(cat[i].dig[pp][pc]); pp = pc; q++; } /* don't forget the normalizing constant */ cat[i].score -= cat[i].logZ; /* update complexity for this category */ cat[i].complexity++; /* add correction factor for multinomial case */ if(h) { cat[i].score += log((weight_t)cat[i].complexity) - log((weight_t)h->count); } } break; case sequential: default: /* see if this is for us */ if( ((re == 0) && !cat[i].retype) || ((re > 0) && (cat[i].retype & (1<<(re-1)))) ) { /* if token found, add its lambda weight */ k = find_in_category(&cat[i], id); if( k ) { cat[i].score += UNPACK_LAMBDA(k->lam); } if( r == 1 ) { /* now compute the reference weight from digram model */ pp = (unsigned char)*tok; q = tok + 1; while( *q ) { if( *q == '\r' ) { q++; continue; } pc = (unsigned char)*q; cat[i].score += UNPACK_DIGRAMS(cat[i].dig[pp][pc]); pp = pc; q++; } } if( r == cat[i].max_order ) { /* don't forget the normalizing constant */ cat[i].score -= cat[i].logZ; /* update complexity for this category */ cat[i].complexity++; } } break; } } }}/*********************************************************** * FILE MANAGEMENT FUNCTIONS * ***********************************************************//* loads a category hash */error_code_t load_category(Category *cat) { hash_count_t i, j; regex_count_t c; char buf[MAGIC_BUFSIZE]; char scratchbuf[MAGIC_BUFSIZE]; char *p; short int shint_val; long int lint_val1, lint_val2, lint_val3; FILE *input;#if defined HAVE_LIBBOOST_REGEX charbuf_len_t r;#endif if( (input = fopen(cat->filename, "rb")) ) { fgets(buf, MAGIC_BUFSIZE, input); if( strncmp(buf, MAGIC1, 16 + strlen(VERSION)) ) { fprintf(stderr, "error: not a dbacl version "VERSION" category file [%s]\n", cat->filename); return 0; } init_category(cat); /* changes filename */ fgets(buf, MAGIC_BUFSIZE, input); sscanf(buf, MAGIC2_i, &cat->divergence, &cat->logZ, &shint_val, scratchbuf); cat->max_order = (token_order_t)shint_val; if( scratchbuf[0] == 'm' ) { cat->model_type = simple; } else { cat->model_type = sequential; } fgets(buf, MAGIC_BUFSIZE, input); sscanf(buf, MAGIC3, &shint_val, &lint_val1, &lint_val2, &lint_val3); cat->max_hash_bits = (token_order_t)shint_val; cat->model_full_token_count = (token_count_t)lint_val1; cat->model_unique_token_count = (token_count_t)lint_val2; cat->model_num_docs = (document_count_t)lint_val3; cat->max_tokens = (1<<cat->max_hash_bits); /* see if there are any regexes */ fgets(buf, MAGIC_BUFSIZE, input); while(1) { if( strncmp(buf, MAGIC6, 2) == 0 ) { break; } else if( strncmp(buf, MAGIC5_i, 8) == 0 ) { /* set up the submatch bitmap */ re[regex_count].submatches |= 0; if( (p = strrchr(buf + RESTARTPOS, '|')) && ( *(--p) == '|') ) { /* assume string ends in ||12345, use as bitmap */ *p = '\0'; for(p += 2; *p; p++) { /* assume ascii number positions */ if( (*p > '9') || (*p < '1')) { if( *p != '\n' ) { fprintf(stderr, "warning: could not decode || suffix for %s while loading %s\n", buf + RESTARTPOS, cat->filename); } } else { re[regex_count].submatches |= (1<<(*p - '0')); } } } else { /* no bitmap specified */ re[regex_count].submatches = ~0; }#if defined HAVE_LIBBOOST_REGEX /* boost regexes accept wide characters */ r = strlen(buf + RESTARTPOS); re[regex_count].string = malloc((r + 1) * sizeof(wchar_t)); /* note: skip the trailing newline */ r = mbstowcs(re[regex_count].string, buf + RESTARTPOS, r - 1); if( r <= 0 ) { fprintf(stderr, "error: couldn't convert regular expression '%s'.\n", buf + RESTARTPOS); exit(0); /* no point in going on */ } else { re[regex_count].string[r] = L'\0'; } /* now see if we already have this string */ for(c = 0; c < regex_count; c++) { if( wcscmp(re[c].string, re[regex_count].string) == 0 ) { break; } } if( c < regex_count ) { /* we found this regex, no need to repeat it */ free(re[regex_count].string); } else { /* compile the regex (uses regcompW) */ if( regcomp(&re[regex_count].regex, re[regex_count].string, REG_EXTENDED) != 0 ) { fprintf(stderr, "error: could not compile regular expression '%ls' for %s.\n", re[regex_count].string, cat->filename); exit(0); } else { regex_count++; if( regex_count >= MAX_RE ) { fprintf(stderr, "error: too many regular expressions\n"); exit(0); /* no point in going on */ } } }#else /* remove trailing newline */ if( buf[strlen(buf) - 1] == '\n' ) { buf[strlen(buf) - 1] = '\0'; } /* GNU regexes use ordinary strings */ /* it's a regex - see if we've got it already */ for(c = 0; c < regex_count; c++) { if( strcmp(re[c].string, buf + RESTARTPOS) == 0 ) { break; } } if( c >= regex_count ) { /* not found */ /* add it to our list */ re[regex_count].string = strdup(buf + RESTARTPOS); /* and compile the regex */ if( regcomp(&re[regex_count].regex, re[regex_count].string, REG_EXTENDED) != 0 ) { fprintf(stderr, "error: could not compile regular expression '%s' for %s.\n", re[regex_count].string, cat->filename); exit(0); /* not much point going on */ } else { regex_count++; if( regex_count >= MAX_RE ) { fprintf(stderr, "error: too many regular expressions\n"); exit(0); /* no point in going on */ } } }#endif /* now flag the regex as taken */ cat->retype |= (1<<c); } else if( strncmp(buf, MAGIC4, 10) == 0) { /* it's the case sensitive flag */ options |= (1<<OPTION_CASEN); } else if( strncmp(buf, MAGIC7, 5) == 0) { options |= (1<<OPTION_I18N); } /* finished with current line, get next one */ fgets(buf, MAGIC_BUFSIZE, input); } /* if this category did not register a regex, it wants the default processing, so we flag this */ if( !cat->retype ) { options |= (1<<OPTION_NOREGEX); } /* read character frequencies */ fread(cat->dig, SIZEOF_DIGRAMS, ASIZE * ASIZE, input); /* allocate hash table */ if( !(cat->hash = malloc(sizeof(c_item) * cat->max_tokens)) ) { fprintf(stderr, "error: not enough memory for category %s\n", cat->filename); return 0; } /* read in hash table */ i = cat->max_tokens; j = 0; while( j < i ) { j += fread(cat->hash + j, sizeof(c_item), i, input); } fclose(input); } else { fprintf(stderr, "error: cannot open file for reading %s\n", cat->filename); return 0; } return 1;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -