📄 catfun.c

📁 dbacl是一个通用目的的digramic贝叶斯文本分类器。它可以学习你提供的文本
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
 		      "warning: empirical hash full, calculation may be skewed. "  		      "Try option -h %d\n",  		      (empirical.max_hash_bits + 1)); 	      hashfull_warning = 1; 	    } 	    return; /* pretend word doesn't exist */ 	  }	  if( empirical.track_features ) {	    if( empirical.feature_stack_top < MAX_TOKEN_LINE_STACK ) {	      empirical.feature_stack[empirical.feature_stack_top++] = h;	    } else {	      empirical.track_features = 0;	      empirical.feature_stack_top = 0;	    }	  } 	}	empirical.full_token_count += 	  ( empirical.full_token_count < K_TOKEN_COUNT_MAX ) ? 1 : 0;	      }    }    /* now do scoring for all available categories */    for(i = 0; i < cat_count; i++) {      switch(cat[i].model_type) {      case simple:	/* see if this is for us */	if( ((re == 0) && (r == 1)) ||	    ((re > 0) && (cat[i].retype & (1<<(re-1)))) ) {	  /* if token found, add its lambda weight */	  k = find_in_category(&cat[i], id);	  if( k ) {	    cat[i].score += UNPACK_LAMBDA(k->lam);	  }	  /* now compute the reference weight from digram model */	  pp = (unsigned char)*tok;	  q = tok + 1;	  while( *q ) {	    if( *q == '\r' ) {	      q++;	      continue;	    }	    pc = (unsigned char)*q;	    cat[i].score += UNPACK_DIGRAMS(cat[i].dig[pp][pc]);	    pp = pc;	    q++;	  }	  /* don't forget the normalizing constant */	  cat[i].score -= cat[i].logZ;      	  /* update complexity for this category */	  cat[i].complexity++;	  /* add correction factor for multinomial case */	  if(h) {	    cat[i].score += log((weight_t)cat[i].complexity) - 	      log((weight_t)h->count);	  }	}	break;      case sequential:      default:	/* see if this is for us */	if( ((re == 0) && !cat[i].retype) ||	    ((re > 0) && (cat[i].retype & (1<<(re-1)))) ) {	  /* if token found, add its lambda weight */	  k = find_in_category(&cat[i], id);	  if( k ) {	    cat[i].score += UNPACK_LAMBDA(k->lam);	  }	  if( r == 1 ) {	    /* now compute the reference weight from digram model */	    pp = (unsigned char)*tok;	    q = tok + 1;	    while( *q ) {	      if( *q == '\r' ) {		q++;		continue;	      }	      pc = (unsigned char)*q;	      cat[i].score += UNPACK_DIGRAMS(cat[i].dig[pp][pc]);	      pp = pc;	      q++;	    }	  }	  if( r == cat[i].max_order ) {	    /* don't forget the normalizing constant */	    cat[i].score -= cat[i].logZ;      	    /* update complexity for this category */	    cat[i].complexity++;	  }	}	break;      }    }  }}/*********************************************************** * FILE MANAGEMENT FUNCTIONS                               * ***********************************************************//* loads a category hash */error_code_t load_category(Category *cat) {  hash_count_t i, j;  regex_count_t c;  char buf[MAGIC_BUFSIZE];  char scratchbuf[MAGIC_BUFSIZE];  char *p;  short int shint_val;  long int lint_val1, lint_val2, lint_val3;  FILE *input;#if defined HAVE_LIBBOOST_REGEX  charbuf_len_t r;#endif  if( (input = fopen(cat->filename, "rb")) ) {    fgets(buf, MAGIC_BUFSIZE, input);    if( strncmp(buf, MAGIC1, 16 + strlen(VERSION)) ) {      fprintf(stderr, 	      "error: not a dbacl version "VERSION" category file [%s]\n", 	      cat->filename);      return 0;    }     init_category(cat); /* changes filename */    fgets(buf, MAGIC_BUFSIZE, input);    sscanf(buf, MAGIC2_i, &cat->divergence, &cat->logZ, 	   &shint_val, scratchbuf);    cat->max_order = (token_order_t)shint_val;    if( scratchbuf[0] == 'm' ) {      cat->model_type = simple;    } else {      cat->model_type = sequential;    }    fgets(buf, MAGIC_BUFSIZE, input);    sscanf(buf, MAGIC3, 	   &shint_val,	   &lint_val1,	   &lint_val2,	   &lint_val3);    cat->max_hash_bits = (token_order_t)shint_val;    cat->model_full_token_count = (token_count_t)lint_val1;    cat->model_unique_token_count = (token_count_t)lint_val2;    cat->model_num_docs = (document_count_t)lint_val3;    cat->max_tokens = (1<<cat->max_hash_bits);    /* see if there are any regexes */    fgets(buf, MAGIC_BUFSIZE, input);    while(1) {      if( strncmp(buf, MAGIC6, 2) == 0 ) {	break;      } else if( strncmp(buf, MAGIC5_i, 8) == 0 ) {	/* set up the submatch bitmap */	re[regex_count].submatches |= 0;	if( (p = strrchr(buf + RESTARTPOS, '|')) && ( *(--p) == '|') ) {	  /* assume string ends in ||12345, use as bitmap */	  *p = '\0';	  for(p += 2; *p; p++) {	    /* assume ascii number positions */	    if( (*p > '9') || (*p < '1')) {	      if( *p != '\n' ) {		fprintf(stderr, 			"warning: could not decode || suffix for %s while loading %s\n", 			buf + RESTARTPOS, cat->filename);	      }	    } else {	      re[regex_count].submatches |= (1<<(*p - '0'));	    }	  }	} else { /* no bitmap specified */	  re[regex_count].submatches = ~0;	}#if defined HAVE_LIBBOOST_REGEX	/* boost regexes accept wide characters */	    	r = strlen(buf + RESTARTPOS);	re[regex_count].string = malloc((r + 1) * sizeof(wchar_t));	/* note: skip the trailing newline */	r = mbstowcs(re[regex_count].string, buf + RESTARTPOS, r - 1); 	if( r <= 0 ) {	  fprintf(stderr, 		  "error: couldn't convert regular expression '%s'.\n",		  buf + RESTARTPOS);	  exit(0); /* no point in going on */	} else {	  re[regex_count].string[r] = L'\0';	}	/* now see if we already have this string */	for(c = 0; c < regex_count; c++) {	  if( wcscmp(re[c].string, re[regex_count].string) == 0 ) {	    break;	  }	}	if( c < regex_count ) {	  /* we found this regex, no need to repeat it */	  free(re[regex_count].string);	} else {	  /* compile the regex (uses regcompW) */	  if( regcomp(&re[regex_count].regex,		      re[regex_count].string, REG_EXTENDED) != 0 ) {	    fprintf(stderr, 		    "error: could not compile regular expression '%ls' for %s.\n", 		    re[regex_count].string, cat->filename);	    exit(0);	  } else {	    regex_count++;	    if( regex_count >= MAX_RE ) { 	      fprintf(stderr, "error: too many regular expressions\n");	      exit(0); /* no point in going on */	    }	  }	}#else	/* remove trailing newline */	if( buf[strlen(buf) - 1] == '\n' ) { 	  buf[strlen(buf) - 1] = '\0'; 	}	/* GNU regexes use ordinary strings */	/* it's a regex - see if we've got it already */	for(c = 0; c < regex_count; c++) {	  if( strcmp(re[c].string, buf + RESTARTPOS) == 0 ) {	    break;	  }	}	if( c >= regex_count ) { /* not found */	  /* add it to our list */	  re[regex_count].string = strdup(buf + RESTARTPOS);	  /* and compile the regex */	  if( regcomp(&re[regex_count].regex, 		      re[regex_count].string, REG_EXTENDED) != 0 ) {	    fprintf(stderr, 		    "error: could not compile regular expression '%s' for %s.\n", 		    re[regex_count].string, cat->filename);	    exit(0); /* not much point going on */	  } else {	    regex_count++;	    if( regex_count >= MAX_RE ) { 	      fprintf(stderr, "error: too many regular expressions\n");	      exit(0); /* no point in going on */	    }	  }	}#endif	/* now flag the regex as taken */	cat->retype |= (1<<c);      } else if( strncmp(buf, MAGIC4, 10) == 0) {	/* it's the case sensitive flag */	options |= (1<<OPTION_CASEN);      } else if( strncmp(buf, MAGIC7, 5) == 0) {	options |= (1<<OPTION_I18N);      }      /* finished with current line, get next one */      fgets(buf, MAGIC_BUFSIZE, input);     }    /* if this category did not register a regex, it wants       the default processing, so we flag this */    if( !cat->retype ) {      options |= (1<<OPTION_NOREGEX);    }    /* read character frequencies */    fread(cat->dig, SIZEOF_DIGRAMS, ASIZE * ASIZE, input);    /* allocate hash table */    if( !(cat->hash = malloc(sizeof(c_item) * cat->max_tokens)) ) {      fprintf(stderr, "error: not enough memory for category %s\n", 	      cat->filename);      return 0;    }    /* read in hash table */    i = cat->max_tokens;    j = 0;    while( j < i ) {      j += fread(cat->hash + j, sizeof(c_item), i, input);    }    fclose(input);  } else {    fprintf(stderr, "error: cannot open file for reading %s\n", 	    cat->filename);    return 0;  }  return 1;}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -