
📄 dbacl.c

📁 dbacl is a general-purpose digramic Bayesian text classifier. It can learn from the text documents you provide.
💻 C
📖 Page 1 of 4
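The "digramic" part of the description refers to the character-bigram reference measure built on this page of the source. Reading compute_digram_probabilities() further down, each entry of learner.dig is overwritten with a Dirichlet-smoothed log transition probability; in notation of my own (not taken from the source), with n_ij standing for the raw digram count learner.dig[i][j] accumulated by hash_word_and_learn():

$$ \log \hat{P}(j \mid i) \;=\; \log \frac{n_{ij} + u_j}{\sum_{k} n_{ik} + \alpha} $$

where the parameters u_j and alpha = sum_j u_j (dirichlet.u[j] and dirichlet.alpha in the code) are initialised uniformly in init_dirichlet() and then fitted by the MacKay-Peto fixed-point iteration in optimize_dirichlet().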
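The first half of the listing revolves around an open-addressing token hash: find_in_learner() probes it, and grow_learner_hash() doubles it and redistributes entries. Below is a minimal, self-contained sketch of that linear-probing pattern; all names and types here are illustrative stand-ins, not dbacl's own.

#include <stddef.h>
#include <stdint.h>

typedef struct { uint32_t id; uint32_t count; } slot_t;

/* Return the slot holding id, the empty slot where it belongs, or NULL
   when the table is completely full. size must be a power of two, and
   id must be nonzero because 0 marks an empty slot. */
static slot_t *probe(slot_t *table, size_t size, uint32_t id) {
  slot_t *start = &table[id & (size - 1)];
  slot_t *p = start;
  while (p->id != 0) {
    if (p->id == id) { return p; }            /* found existing entry */
    if (++p == &table[size]) { p = table; }   /* wrap around at the end */
    if (p == start) { return NULL; }          /* walked full circle: full */
  }
  return p;                                   /* empty slot, not found */
}

grow_learner_hash() applies the same idea after doubling the array: every occupied slot is marked, then re-probed from its new home position id & new_size, swapping or moving entries until no marks remain.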
      i++; /* not found */
      /* wrap around */
      i = (i >= &learner.hash[learner.max_tokens]) ? learner.hash : i;
      if( i == loop ) {
        return NULL; /* when hash table is full */
      }
    }
  }
  /* empty slot, so not found */
  return i;
}

/* returns true if the hash could be grown, false otherwise.
   When the hash is grown, the old values must be redistributed.
   To do this, we reuse the l_item.lam member as a flag, since this
   is only needed later during minimization */
bool_t grow_learner_hash() {
  hash_count_t c, new_size;
  l_item *i, temp_item;

  if( !(options & (1<<OPTION_GROWHASH)) ) {
    return 0;
  } else {
    if( learner.max_hash_bits < default_max_grow_hash_bits ) {
      fprintf(stderr,
              "warning: growing hash table, next time perhaps consider using -h %d\n",
              learner.max_hash_bits + 1);

      /* grow the memory around the hash */
      if( (i = realloc(learner.hash,
                       sizeof(l_item) *
                       (1<<(learner.max_hash_bits+1)))) == NULL ) {
        fprintf(stderr,
                "warning: sorry, failed to grow\n");
        return 0;
      }
      learner.max_hash_bits++;

      /* realloc doesn't initialize the memory */
      memset(i + learner.max_tokens, 0,
             ((1<<learner.max_hash_bits) - learner.max_tokens) *
             sizeof(l_item));
      learner.hash = i;

      /* now mark every used slot */
      for(c = 0; c < learner.max_tokens; c++) {
        if( FILLEDP(&learner.hash[c]) ) {
          SETMARK(&learner.hash[c]);
        }
      }

      /* now relocate each marked slot and clear it */
      new_size = (1<<learner.max_hash_bits) - 1;
      for(c = 0; c < learner.max_tokens; c++) {
        while( MARKEDP(&learner.hash[c]) ) {
          /* find where it belongs */
          i = &learner.hash[learner.hash[c].id & new_size];
          while( FILLEDP(i) && !MARKEDP(i) ) {
            i++;
            i = (i > &learner.hash[new_size]) ? learner.hash : i;
          } /* guaranteed to exit since hash is larger than original */

          /* clear the mark - this must happen after we look for i,
             since it should be possible to find i == learner.hash[c] */
          UNSETMARK(&learner.hash[c]);

          /* now refile */
          if( i != &learner.hash[c] ) {
            if( MARKEDP(i) ) {
              /* swap */
              memcpy(&temp_item, i, sizeof(l_item));
              memcpy(i, &learner.hash[c], sizeof(l_item));
              memcpy(&learner.hash[c], &temp_item, sizeof(l_item));
            } else {
              /* copy and clear */
              memcpy(i, &learner.hash[c], sizeof(l_item));
              memset(&learner.hash[c], 0, sizeof(l_item));
            }
          }
          /* now &learner.hash[c] is marked iff there was a swap */
        }
      }
      learner.max_tokens = (1<<learner.max_hash_bits);
    } else {
      options &= ~(1<<OPTION_GROWHASH); /* it's the law */
    }
  }
  return 1;
}

/* places the token in the global hash and writes the
   token to a temporary file for later, then updates
   the digram frequencies */
void hash_word_and_learn(char *tok, token_order_t r, regex_count_t re) {
  hash_value_t id;
  l_item *i;
  alphabet_size_t p,q;

  if( *(tok + 2) ) { /* DIAMOND-DIAMOND is empty string - ignore */

    if( overflow_warning ) {
      return; /* for there be troubles ahead */
    }

    if( options & (1<<OPTION_MULTINOMIAL) ) { r = 1; }

    if( (options & (1<<OPTION_DECIMATE)) &&
        (rand() > (RAND_MAX>>decimation)) ) {
      return;
    }

    id = (hash_value_t)hash((unsigned char *)tok, strlen(tok), 0);
    i = find_in_learner(id);

    if( i && !FILLEDP(i) &&
        ((100 * learner.unique_token_count) >=
         (HASH_FULL * learner.max_tokens)) && grow_learner_hash() ) {
      i = find_in_learner(id);
      /* new i, go through all tests again */
    }

    if( i ) {
      if( FILLEDP(i) ) {
        /* redundant :-) */
      } else if( /* !FILLEDP(i) && */
                ((100 * learner.unique_token_count) <
                 (HASH_FULL * learner.max_tokens) ) ) {
        /* fill the hash and write to file */
        SET(i->id,id);
        if( learner.unique_token_count < K_TOKEN_COUNT_MAX )
          { learner.unique_token_count++; } else { overflow_warning = 1; }
        i->order = r;

        /* order accounting */
        learner.max_order = (learner.max_order < i->order) ?
          i->order : learner.max_order;

        if( learner.fixed_order_unique_token_count[i->order] < K_TOKEN_COUNT_MAX )
          { learner.fixed_order_unique_token_count[i->order]++; } else
            { overflow_warning = 1; }

        if( (options & (1<<OPTION_DEBUG)) ) {
          fprintf(stdout, "match %s %d\n", tok, i->order);
        }

        /* now save token to temporary file */
        fputs(tok, learner.tmp);
        fputc(-1, learner.tmp);
      }

      if( i->count < K_TOKEN_COUNT_MAX )
        { i->count++; } else { overflow_warning = 1; }

      if( learner.full_token_count < K_TOKEN_COUNT_MAX )
        { learner.full_token_count++; } else
          { /* this number is just cosmetic */ }

      if( learner.fixed_order_token_count[i->order] < K_TOKEN_COUNT_MAX )
        { learner.fixed_order_token_count[i->order]++; } else
          { skewed_constraints_warning = 1; }
    }

    if( digramic_overflow_warning ) {
      return;
    }

    /* now update digram frequency counts */
    if( r == 1 ) { /* count each token only once */
      p = *tok++;
      /* finally update character frequencies */
      while( *tok ) {
        q = (unsigned char)*tok;
        if( learner.dig[p][q] < K_TOKEN_COUNT_MAX )
          { learner.dig[p][q]++; } else { digramic_overflow_warning = 1; }
        p = q;
        tok++;
      }

      if( digramic_overflow_warning ) {
        fprintf(stderr,
                "warning: ran out of integers (too much data), "
                "reference measure may be skewed.\n");
      }
    }

    if( overflow_warning ) {
      fprintf(stderr,
              "warning: ran out of integers (too much data), "
              "results may be skewed.\n");
    }
  }
}

/* initialize global learner object */
void init_learner() {
  alphabet_size_t i, j;
  regex_count_t c;
  token_order_t z;

  /* some constants */
  learner.max_tokens = default_max_tokens;
  learner.max_hash_bits = default_max_hash_bits;
  learner.full_token_count = 0;
  learner.unique_token_count = 0;
  learner.logZ = 0.0;
  learner.max_order = 0;

  /* init character frequencies */
  for(i = 0; i < ASIZE; i++) {
    for(j = 0; j < ASIZE; j++) {
      learner.dig[i][j] = 0.0;
    }
  }

  /* open temporary file for writing tokens */
  learner.tmp = tmpfile();

  /* allocate room for hash */
  learner.hash = calloc(learner.max_tokens, sizeof(l_item));
  if( !learner.hash ) {
    fprintf(stderr,
            "error: not enough memory? I couldn't allocate %li bytes\n",
            (sizeof(l_item) * ((long int)learner.max_tokens)));
    exit(0);
  }

  for(c = 0; c < MAX_RE + 1; c++) {
    learner.regex_token_count[c] = 0;
  }

  for(z = 0; z < MAX_SUBMATCH; z++) {
    learner.fixed_order_token_count[z] = 0;
    learner.fixed_order_unique_token_count[z] = 0;
  }

  if( options & (1<<OPTION_MBOX_FORMAT) ) {
    reset_mbox_messages();
  }
}

/* initializes to the Laplacian parameters */
void init_dirichlet() {
  alphabet_size_t i;
  dirichlet.alpha = 1.0;
  for(i = 0; i < ASIZE; i++) {
    dirichlet.u[i] = 1.0/((double)ASIZE);
  }
}

/* calculates the most probable Dirichlet parameters
   given digram counts. Method described in MacKay and Peto (1995).
   Since we don't count the DIAMOND-DIAMOND digram, including the prior will
   overestimate the DIAMOND-DIAMOND transition probability. However, for this
   transition as well as the others, the Dirichlet will have practically
   no effect when there's enough data in the corpus. */
void optimize_dirichlet() {
  alphabet_size_t i, j;
  token_count_t k;
  float G[ASIZE];
  float H[ASIZE];
  float V[ASIZE];
  float F[ASIZE];
  double prev_alpha;
  double K, t;

  /* precompute useful constants */
  for(j = 1; j < ASIZE; j++) {
    G[j] = 0.0;
    H[j] = 0.0;
    V[j] = 0.0;
    F[j] = 0.0;
    for(i = 1; i < ASIZE; i++) {
      for(k = 1; k < learner.dig[i][j]; k++) {
        G[j] += 1.0/k;
        H[j] += 1.0/((double)k*k);
      }
      if( learner.dig[i][j] > 0 ) {
        V[j]++;
        F[j] += learner.dig[i][j];
      }
    }
    if(0 && options & (1<<OPTION_DEBUG) ) {
      fprintf(stdout,
              "symbol %d has G = %g H = %g V = %g F = %g\n",
              j, G[j], H[j], V[j], F[j]);
    }
  }

  /* now optimize the dirichlet parameters */
  do {
    prev_alpha = dirichlet.alpha;
    K = 0.0;
    for(i = 1; i < ASIZE; i++) {
      K += log((F[i] + prev_alpha)/prev_alpha) +
        (0.5) * F[i]/(prev_alpha * (F[i] + prev_alpha));
    }
    dirichlet.alpha = 0.0;
    for(j = 1; j < ASIZE; j++) {
      t = K - G[j];
      dirichlet.u[j] = 2 * V[j] /(t + sqrt(t * t + 4 * H[j] * V[j]));
      dirichlet.alpha += dirichlet.u[j];
    }
    if( options & (1<<OPTION_VERBOSE) ) {
      fprintf(stdout,
              "dirichlet alpha %g --> %g\n",
              prev_alpha, dirichlet.alpha);
    }
  } while( fabs(prev_alpha - dirichlet.alpha) > TOL );

  if(0 && options & (1<<OPTION_DEBUG) ) {
    for(j = 1; j < ASIZE; j++) {
      fprintf(stdout,
              "dirichlet u[%d] =  %g\n",
              j, dirichlet.u[j]);
    }
  }
}

/* computes the _logarithm_ of digram transition probabilities */
void compute_digram_probabilities() {
  alphabet_size_t i,j;
  weight_t t;

  for(i = 1; i < ASIZE; i++) {
    t = 0.0;
    for(j = 1; j < ASIZE; j++) {
      t += learner.dig[i][j];
    }
    for(j = 1; j < ASIZE; j++) {
      learner.dig[i][j] =
        log(((double)learner.dig[i][j] + dirichlet.u[j]) /
            (t + dirichlet.alpha));
      /* simulate the effect of digitizing the digrams */
      learner.dig[i][j] = UNPACK_DIGRAMS(PACK_DIGRAMS(learner.dig[i][j]));
      if(0 && options & (1<<OPTION_DEBUG) ) {
        fprintf(stdout,
                "learner.dig[%d][%d] = %f\n",
                i, j, learner.dig[i][j]);
      }
    }
  }
}

weight_t calc_learner_digramic_excursion(char *tok) {
  register alphabet_size_t p, q;
  register weight_t t = 0.0;

  /* now update digram frequency counts */
  p = (unsigned char)*tok++;
  /* finally update character frequencies */
  while( *tok ) {
    q = (unsigned char)*tok;
    t += learner.dig[p][q];
    p = q;
    tok++;
  }
  return t;
}

/* calculates the rth-order divergence but needs normalizing constant
   note: this isn't the full divergence from digref, just the bits needed
   for the r-th optimization */
score_t learner_divergence(score_t z, token_order_t r) {
  register hash_count_t i;
  register token_count_t c = 0;
  score_t t = 0.0;

  for(i = 0; i < learner.max_tokens; i++) {
    if( FILLEDP(&learner.hash[i]) &&
        (learner.hash[i].order == r) ) {
      t += (learner.hash[i].lam) * learner.hash[i].count;
      if( ++c >= learner.fixed_order_unique_token_count[r] ) {
        c = 0;
        break;
      }
    }
  }
  return -log(z)/((weight_t)r) +
    t/((weight_t)learner.fixed_order_token_count[r]);
}

/* calculates the normalizing constant */
score_t learner_Z(token_order_t r, score_t unchanging_part) {
  register hash_count_t i;
  register token_count_t c = 0;
  score_t t = 0.0;

  for(i = 0; i < learner.max_tokens; i++) {
    if( FILLEDP(&learner.hash[i]) &&
        (learner.hash[i].order == r) ) {
      t += (exp(r * (learner.hash[i].lam)) - 1.0) *
        exp(r * UNPACK_LWEIGHTS(learner.hash[i].ltrms) +
            UNPACK_LWEIGHTS(learner.hash[i].dref));
      if( ++c >= learner.fixed_order_unique_token_count[r] ) {
        c = 0;
        break;
      }
    }
  }

  /* investigate this someday */
  if( isnan(t + unchanging_part) ) {
    fprintf(stderr, "error: sorry, partition function went kaboom.\n");
    exit(0);
  }

  return t + unchanging_part;
}

/* fills the hash with partial calculations */
void fill_ref_vars(l_item *k, char *tok, token_order_t r) {
  hash_value_t id;
  char *t;
  l_item *l;

  /* weight of the r-th order excursion */
  k->dref = PACK_LWEIGHTS(calc_learner_digramic_excursion(tok));

  /* for each prefix of tok, add its weight */
  k->ltrms = PACK_LWEIGHTS(0.0);
  for( t = tok + strlen(tok) - 1; t > tok + 1; t-- ) {
    if( *(t - 1) == DIAMOND ) {
      *t = '\0';
      id = (hash_value_t)hash((unsigned char *)tok, t - tok, 0);
      l = find_in_learner(id); /* guaranteed to be found */
      k->ltrms += PACK_LWEIGHTS(l->lam);
    }
  }
}

/* fills hash with partial calculations and returns the
   unchanging part of the normalizing constant */
score_t recalculate_reference_measure(token_order_t r) {
  hash_value_t id;
  char buf[BUFLEN];
  char tok[MAX_TOKEN_LEN];
  char *p, *q;
  l_item *k;
  score_t partial_z = 1.0;

  /* now we calculate the logarithmic word weight
     from the digram model, for each token in the hash */
  rewind(learner.tmp);
  q = tok;
  while( !feof(learner.tmp) && fgets(buf, BUFLEN, learner.tmp) ) {
    p = buf;
    while( *p ) {
      if( *p != -1) {
        *q++ = *p; /* copy into tok */
      } else { /* interword space */
        *q = 0; /* append NUL to tok */
        /* now write weight in hash */
        id = (hash_value_t)hash((unsigned char *)tok, strlen(tok), 0);
        k = find_in_learner(id); /* guaranteed to be found */
        fill_ref_vars(k, tok, r);
        /* now calculate the unchanging part of the normalizing constant */
        if(k->order < r) {
          partial_z += (exp(r * (k->lam)) - 1.0) *
            exp(r * UNPACK_LWEIGHTS(k->ltrms) +
                UNPACK_LWEIGHTS(k->dref));
        }
        q = tok; /* reset q */
      }
      p++;
