dbacl.c
    i++; /* not found */
    /* wrap around */
    i = (i >= &learner.hash[learner.max_tokens]) ? learner.hash : i;
    if( i == loop ) {
      return NULL; /* when hash table is full */
    }
  }

  /* empty slot, so not found */
  return i;
}

/* returns true if the hash could be grown, false otherwise.
   When the hash is grown, the old values must be redistributed.
   To do this, we reuse the l_item.lam member as a flag, since this
   is only needed later during minimization */
bool_t grow_learner_hash() {
  hash_count_t c, new_size;
  l_item *i, temp_item;

  if( !(options & (1<<OPTION_GROWHASH)) ) {
    return 0;
  } else {
    if( learner.max_hash_bits < default_max_grow_hash_bits ) {
      fprintf(stderr,
              "warning: growing hash table, next time perhaps consider using -h %d\n",
              learner.max_hash_bits + 1);

      /* grow the memory around the hash */
      if( (i = realloc(learner.hash,
                       sizeof(l_item) * (1<<(learner.max_hash_bits+1)))) == NULL ) {
        fprintf(stderr, "warning: sorry, failed to grow\n");
        return 0;
      }
      learner.max_hash_bits++;

      /* realloc doesn't initialize the memory */
      memset(i + learner.max_tokens, 0,
             ((1<<learner.max_hash_bits) - learner.max_tokens) * sizeof(l_item));
      learner.hash = i;

      /* now mark every used slot */
      for(c = 0; c < learner.max_tokens; c++) {
        if( FILLEDP(&learner.hash[c]) ) {
          SETMARK(&learner.hash[c]);
        }
      }

      /* now relocate each marked slot and clear it */
      new_size = (1<<learner.max_hash_bits) - 1;
      for(c = 0; c < learner.max_tokens; c++) {
        while( MARKEDP(&learner.hash[c]) ) {
          /* find where it belongs */
          i = &learner.hash[learner.hash[c].id & new_size];
          while( FILLEDP(i) && !MARKEDP(i) ) {
            i++;
            i = (i > &learner.hash[new_size]) ? learner.hash : i;
          }
          /* guaranteed to exit since hash is larger than original */

          /* clear the mark - this must happen after we look for i, since
             it should be possible to find i == &learner.hash[c] */
          UNSETMARK(&learner.hash[c]);

          /* now refile */
          if( i != &learner.hash[c] ) {
            if( MARKEDP(i) ) {
              /* swap */
              memcpy(&temp_item, i, sizeof(l_item));
              memcpy(i, &learner.hash[c], sizeof(l_item));
              memcpy(&learner.hash[c], &temp_item, sizeof(l_item));
            } else {
              /* copy and clear */
              memcpy(i, &learner.hash[c], sizeof(l_item));
              memset(&learner.hash[c], 0, sizeof(l_item));
            }
          }
          /* now &learner.hash[c] is marked iff there was a swap */
        }
      }
      learner.max_tokens = (1<<learner.max_hash_bits);
    } else {
      options &= ~(1<<OPTION_GROWHASH); /* it's the law */
    }
  }
  return 1;
}
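/* Illustrative sketch only (not part of dbacl): the lookup scheme that
   find_in_learner() and the relocation loop in grow_learner_hash() rely on
   is plain open addressing in a power-of-two table, indexed by (id & mask)
   with linear probing and wrap-around.  The toy_* names and the 8-slot
   size below are made up for illustration. */
#if 0
#define TOY_BITS 3
#define TOY_SIZE (1 << TOY_BITS)

static unsigned long toy_table[TOY_SIZE]; /* 0 plays the role of "empty" here */

static unsigned long *toy_find(unsigned long id) {
  unsigned long *i, *loop;
  i = loop = &toy_table[id & (TOY_SIZE - 1)];        /* home slot */
  while( *i && (*i != id) ) {                        /* occupied by another id */
    i++;                                             /* linear probe */
    i = (i >= &toy_table[TOY_SIZE]) ? toy_table : i; /* wrap around */
    if( i == loop ) { return NULL; }                 /* table is full */
  }
  return i; /* either the matching slot or the first empty one */
}
#endif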
/* places the token in the global hash and writes the token to a
   temporary file for later, then updates the digram frequencies */
void hash_word_and_learn(char *tok, token_order_t r, regex_count_t re) {
  hash_value_t id;
  l_item *i;
  alphabet_size_t p,q;

  if( *(tok + 2) ) { /* DIAMOND-DIAMOND is empty string - ignore */

    if( overflow_warning ) {
      return; /* for there be troubles ahead */
    }

    if( options & (1<<OPTION_MULTINOMIAL) ) { r = 1; }

    if( (options & (1<<OPTION_DECIMATE)) &&
        (rand() > (RAND_MAX>>decimation)) ) {
      return;
    }

    id = (hash_value_t)hash((unsigned char *)tok, strlen(tok), 0);
    i = find_in_learner(id);

    if( i && !FILLEDP(i) &&
        ((100 * learner.unique_token_count) >=
         (HASH_FULL * learner.max_tokens)) &&
        grow_learner_hash() ) {
      i = find_in_learner(id);
      /* new i, go through all tests again */
    }

    if( i ) {
      if( FILLEDP(i) ) {
        /* redundant :-) */
      } else if( /* !FILLEDP(i) && */
                 ((100 * learner.unique_token_count) <
                  (HASH_FULL * learner.max_tokens)) ) {
        /* fill the hash and write to file */
        SET(i->id,id);
        if( learner.unique_token_count < K_TOKEN_COUNT_MAX ) {
          learner.unique_token_count++;
        } else {
          overflow_warning = 1;
        }
        i->order = r;

        /* order accounting */
        learner.max_order = (learner.max_order < i->order) ?
          i->order : learner.max_order;

        if( learner.fixed_order_unique_token_count[i->order] < K_TOKEN_COUNT_MAX ) {
          learner.fixed_order_unique_token_count[i->order]++;
        } else {
          overflow_warning = 1;
        }

        if( (options & (1<<OPTION_DEBUG)) ) {
          fprintf(stdout, "match %s %d\n", tok, i->order);
        }

        /* now save token to temporary file */
        fputs(tok, learner.tmp);
        fputc(-1, learner.tmp);
      }

      if( i->count < K_TOKEN_COUNT_MAX ) {
        i->count++;
      } else {
        overflow_warning = 1;
      }

      if( learner.full_token_count < K_TOKEN_COUNT_MAX ) {
        learner.full_token_count++;
      } else {
        /* this number is just cosmetic */
      }

      if( learner.fixed_order_token_count[i->order] < K_TOKEN_COUNT_MAX ) {
        learner.fixed_order_token_count[i->order]++;
      } else {
        skewed_constraints_warning = 1;
      }
    }

    if( digramic_overflow_warning ) {
      return;
    }

    /* now update digram frequency counts */
    if( r == 1 ) { /* count each token only once */
      p = *tok++;
      /* finally update character frequencies */
      while( *tok ) {
        q = (unsigned char)*tok;
        if( learner.dig[p][q] < K_TOKEN_COUNT_MAX ) {
          learner.dig[p][q]++;
        } else {
          digramic_overflow_warning = 1;
        }
        p = q;
        tok++;
      }
      if( digramic_overflow_warning ) {
        fprintf(stderr,
                "warning: ran out of integers (too much data), "
                "reference measure may be skewed.\n");
      }
    }

    if( overflow_warning ) {
      fprintf(stderr,
              "warning: ran out of integers (too much data), "
              "results may be skewed.\n");
    }
  }
}
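/* Illustrative sketch only (not part of dbacl): the digram update at the
   end of hash_word_and_learn() walks the token once and bumps a count for
   every consecutive character pair, exactly like learner.dig[p][q]++.
   The toy_dig array and its fixed 256x256 size are assumptions for this
   sketch; dbacl itself uses learner.dig[ASIZE][ASIZE]. */
#if 0
static unsigned long toy_dig[256][256];

static void toy_count_digrams(const char *tok) {
  unsigned char p, q;
  if( !*tok ) { return; }      /* nothing to count */
  p = (unsigned char)*tok++;   /* first character of the token */
  while( *tok ) {
    q = (unsigned char)*tok;
    toy_dig[p][q]++;           /* one more p -> q transition */
    p = q;
    tok++;
  }
}
#endif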
/* initialize global learner object */
void init_learner() {
  alphabet_size_t i, j;
  regex_count_t c;
  token_order_t z;

  /* some constants */
  learner.max_tokens = default_max_tokens;
  learner.max_hash_bits = default_max_hash_bits;
  learner.full_token_count = 0;
  learner.unique_token_count = 0;
  learner.logZ = 0.0;
  learner.max_order = 0;

  /* init character frequencies */
  for(i = 0; i < ASIZE; i++) {
    for(j = 0; j < ASIZE; j++) {
      learner.dig[i][j] = 0.0;
    }
  }

  /* open temporary file for writing tokens */
  learner.tmp = tmpfile();

  /* allocate room for hash */
  learner.hash = calloc(learner.max_tokens, sizeof(l_item));
  if( !learner.hash ) {
    fprintf(stderr,
            "error: not enough memory? I couldn't allocate %li bytes\n",
            (sizeof(l_item) * ((long int)learner.max_tokens)));
    exit(0);
  }

  for(c = 0; c < MAX_RE + 1; c++) {
    learner.regex_token_count[c] = 0;
  }

  for(z = 0; z < MAX_SUBMATCH; z++) {
    learner.fixed_order_token_count[z] = 0;
    learner.fixed_order_unique_token_count[z] = 0;
  }

  if( options & (1<<OPTION_MBOX_FORMAT) ) {
    reset_mbox_messages();
  }
}

/* initializes to the Laplacian parameters */
void init_dirichlet() {
  alphabet_size_t i;
  dirichlet.alpha = 1.0;
  for(i = 0; i < ASIZE; i++) {
    dirichlet.u[i] = 1.0/((double)ASIZE);
  }
}

/* calculates the most probable Dirichlet parameters given digram counts.
   Method described in MacKay and Peto (1995).

   Since we don't count the DIAMOND-DIAMOND digram, including the prior
   will overestimate the DIAMOND-DIAMOND transition probability. However,
   for this transition as well as the others, the Dirichlet will have
   practically no effect when there's enough data in the corpus. */
void optimize_dirichlet() {
  alphabet_size_t i, j;
  token_count_t k;
  float G[ASIZE];
  float H[ASIZE];
  float V[ASIZE];
  float F[ASIZE];
  double prev_alpha;
  double K, t;

  /* precompute useful constants */
  for(j = 1; j < ASIZE; j++) {
    G[j] = 0.0;
    H[j] = 0.0;
    V[j] = 0.0;
    F[j] = 0.0;
    for(i = 1; i < ASIZE; i++) {
      for(k = 1; k < learner.dig[i][j]; k++) {
        G[j] += 1.0/k;
        H[j] += 1.0/((double)k*k);
      }
      if( learner.dig[i][j] > 0 ) {
        V[j]++;
        F[j] += learner.dig[i][j];
      }
    }
    if(0 && options & (1<<OPTION_DEBUG) ) {
      fprintf(stdout, "symbol %d has G = %g H = %g V = %g F = %g\n",
              j, G[j], H[j], V[j], F[j]);
    }
  }

  /* now optimize the dirichlet parameters */
  do {
    prev_alpha = dirichlet.alpha;

    K = 0.0;
    for(i = 1; i < ASIZE; i++) {
      K += log((F[i] + prev_alpha)/prev_alpha) +
        (0.5) * F[i]/(prev_alpha * (F[i] + prev_alpha));
    }

    dirichlet.alpha = 0.0;
    for(j = 1; j < ASIZE; j++) {
      t = K - G[j];
      dirichlet.u[j] = 2 * V[j] /(t + sqrt(t * t + 4 * H[j] * V[j]));
      dirichlet.alpha += dirichlet.u[j];
    }

    if( options & (1<<OPTION_VERBOSE) ) {
      fprintf(stdout, "dirichlet alpha %g --> %g\n",
              prev_alpha, dirichlet.alpha);
    }
  } while( fabs(prev_alpha - dirichlet.alpha) > TOL );

  if(0 && options & (1<<OPTION_DEBUG) ) {
    for(j = 1; j < ASIZE; j++) {
      fprintf(stdout, "dirichlet u[%d] = %g\n", j, dirichlet.u[j]);
    }
  }
}
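/* For reference, the fixed point iterated by optimize_dirichlet() above,
   written out from the loops themselves (n[i][j] stands for the raw digram
   count held in learner.dig[i][j] at this stage; all sums skip index 0,
   as in the code):

     G[j] = sum_i sum_{k=1}^{n[i][j]-1} 1/k
     H[j] = sum_i sum_{k=1}^{n[i][j]-1} 1/k^2
     V[j] = #{ i : n[i][j] > 0 }
     F[j] = sum_i n[i][j]

     K     = sum_i [ log((F[i] + alpha)/alpha) + F[i] / (2 alpha (F[i] + alpha)) ]
     u[j]  = 2 V[j] / ( (K - G[j]) + sqrt((K - G[j])^2 + 4 H[j] V[j]) )
     alpha = sum_j u[j]

   The loop repeats until alpha changes by less than TOL.  The resulting
   u[j] and alpha then act as pseudo-counts in compute_digram_probabilities()
   below, which stores log((n[i][j] + u[j]) / (sum_j n[i][j] + alpha)). */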
kaboom.\n"); exit(0); } return t + unchanging_part;}/* fills the hash with partial calculations */void fill_ref_vars(l_item *k, char *tok, token_order_t r) { hash_value_t id; char *t; l_item *l; /* weight of the r-th order excursion */ k->dref = PACK_LWEIGHTS(calc_learner_digramic_excursion(tok)); /* for each prefix of tok, add its weight */ k->ltrms = PACK_LWEIGHTS(0.0); for( t = tok + strlen(tok) - 1; t > tok + 1; t-- ) { if( *(t - 1) == DIAMOND ) { *t = '\0'; id = (hash_value_t)hash((unsigned char *)tok, t - tok, 0); l = find_in_learner(id); /* guaranteed to be found */ k->ltrms += PACK_LWEIGHTS(l->lam); } }}/* fills hash with partial calculations and returns the unchanging part of the normalizing constant */score_t recalculate_reference_measure(token_order_t r) { hash_value_t id; char buf[BUFLEN]; char tok[MAX_TOKEN_LEN]; char *p, *q; l_item *k; score_t partial_z = 1.0; /* now we calculate the logarithmic word weight from the digram model, for each token in the hash */ rewind(learner.tmp); q = tok; while( !feof(learner.tmp) && fgets(buf, BUFLEN, learner.tmp) ) { p = buf; while( *p ) { if( *p != -1) { *q++ = *p; /* copy into tok */ } else { /* interword space */ *q = 0; /* append NUL to tok */ /* now write weight in hash */ id = (hash_value_t)hash((unsigned char *)tok, strlen(tok), 0); k = find_in_learner(id); /* guaranteed to be found */ fill_ref_vars(k, tok, r); /* now calculate the unchanging part of the normalizing constant */ if(k->order < r) { partial_z += (exp(r * (k->lam)) - 1.0) * exp(r * UNPACK_LWEIGHTS(k->ltrms) + UNPACK_LWEIGHTS(k->dref)); } q = tok; /* reset q */ } p++;