📄 bayesol.c

📁 dbacl是一个通用目的的digramic贝叶斯文本分类器。它可以学习你提供的文本
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
/* reads a text file as input, and applies several   filters. */void process_file(FILE *input, 		  void (*line_fun)(void)) {  char *s;  category_count_t i;  charbuf_len_t k;  charbuf_len_t l;  RegMatch *r;  regmatch_t pmatch[MAX_SUBMATCH];  submatch_order_t z;#if defined HAVE_LIBBOOST_REGEX  charbuf_len_t rr;#endif   /* now start processing */  while( !feof(input) ) {    /* read in a full line, allocating memory as necessary */    textbuf[0] = '\0';    s = textbuf;    l = textbuf_len;    k = 1;    while( fgets(s, l, input) && (strlen(s) >= (l - 1)) ) {      textbuf = realloc(textbuf, 2 * textbuf_len);      if( !textbuf ) {	fprintf(stderr, 		"error: not enough memory for input line (%d bytes)\n",		textbuf_len);	exit(0);      }      s = textbuf + textbuf_len - (k++);      l = textbuf_len;      textbuf_len *= 2;    }    /* now summarize this line if required */    if( line_fun ) { (*line_fun)(); }    if( (textbuf[0] == 's') && 	(strncmp(MAGIC, textbuf, 7) == 0) ) {      if( !parse_dbacl_scores(textbuf) ) {	fprintf(stderr, "error: scores don't match risk specification\n");	exit(0);      } else if( options & (1<<OPTION_DEBUG) ) {	for(i = 0; i < spec.num_cats; i++) {	  fprintf(stdout, 		  "category %s\t cross_entropy %7.2f complexity %7.0f\n",		  spec.catname[i], spec.cross_entropy[i], spec.complexity[i]);	}	fprintf(stdout, "\n");      }    } else {      /* for each regex in our list, try for a match */#if defined HAVE_LIBBOOST_REGEX      /* boost regexes need a wide char string */      /* now convert the line into a wide character string */      if( textbuf_len > wc_textbuf_len ) {	wc_textbuf_len = textbuf_len;	wc_textbuf = realloc(wc_textbuf, wc_textbuf_len * sizeof(wchar_t));	if( !wc_textbuf ) {	  fprintf(stderr, 		  "error: not enough memory for wide character conversion "		  "(%d bytes)\n",		  wc_textbuf_len * sizeof(wchar_t));	  exit(0);	}      }      rr = mbstowcs(wc_textbuf, textbuf, wc_textbuf_len);      if( rr <= 0 && (strlen(textbuf) > 0) ) {	fprintf(stderr, 		"error: couldn't convert an input string for matching, ignoring regexes\n");      } else {	wc_textbuf[rr] = L'\0';		for( r = spec.regs; r != 0; r = r->next) {	  if( regexec(&(r->reg), wc_textbuf, MAX_SUBMATCH, pmatch, 0) == 0 ) {	    r->lv->found = 1;	    /* convert each submatch to a number - pad remaining	       elements to zero */	    for(z = 1; z < MAX_SUBMATCH; z++) {	      if(pmatch[z].rm_so > -1) {		r->lv->sm[z-1] = wcstod(wc_textbuf + pmatch[z].rm_so, NULL);	      } else {		r->lv->sm[z-1] = 0.0;	      }	    }	    if( options & (1<<OPTION_DEBUG) ) {	      fprintf(stdout, 		      "match \"%s\"", r->lv->re);	      for(z = 1; (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) {		fprintf(stdout,			" %f", r->lv->sm[z-1]);	      }	      fprintf(stdout, "\n");	    }	  }	}      }#else      /* GNU regexes expect an ordinary string */      for( r = spec.regs; r != 0; r = r->next) {	if( regexec(&(r->reg), textbuf, MAX_SUBMATCH, pmatch, 0) == 0 ) {	  r->lv->found = 1;	  /* convert each submatch to a number - pad remaining	     elements to zero */	  for(z = 1; z < MAX_SUBMATCH; z++) {	    if(pmatch[z].rm_so > -1) {	      r->lv->sm[z-1] = strtod(textbuf + pmatch[z].rm_so, NULL);	    } else {	      r->lv->sm[z-1] = 0.0;	    }	  }	  if( options & (1<<OPTION_DEBUG) ) {	    fprintf(stdout, 		    "match \"%s\"", r->lv->re);	    for(z = 1; (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) {	      fprintf(stdout,		      " %f", r->lv->sm[z-1]);	    }	    fprintf(stdout, "\n");	  }	}      }#endif    }  }}/*********************************************************** * WIDE CHARACTER FILE HANDLING FUNCTIONS                  * * this is needed for any locale whose character set       * * encoding can include NUL bytes inside characters        * *                                                         * * Actually, at present this is quite useless. But it might* * prove handy in the future.                              * ***********************************************************/#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H/* reads a text file as input, converting each line   into a wide character representation and applies several   filters. */void w_process_file(FILE *input, 		    void (*line_fun)(void)) {  char *s;  charbuf_len_t k;  charbuf_len_t l;  charbuf_len_t wclen;  wchar_t *wp;  mbstate_t input_shiftstate;  category_count_t i;  RegMatch *r;  regmatch_t pmatch[MAX_SUBMATCH];  submatch_order_t z;  memset(&input_shiftstate, 0, sizeof(mbstate_t));  while( !feof(input) ) {    /* read in a full line, allocating memory as necessary */    textbuf[0] = '\0';    s = textbuf;    l = textbuf_len;    k = 1;    while( fgets(s, l, input) && (strlen(s) >= (l - 1)) ) {      textbuf = realloc(textbuf, 2 * textbuf_len);      if( !textbuf ) {	fprintf(stderr, 		"error: not enough memory for input line (%d bytes)\n",		textbuf_len);	exit(0);      }      s = textbuf + textbuf_len - (k++);      l = textbuf_len;      textbuf_len *= 2;    }    /* now convert the line into a wide character string */    if( textbuf_len > wc_textbuf_len ) {      wc_textbuf_len = textbuf_len;      wc_textbuf = realloc(wc_textbuf, wc_textbuf_len * sizeof(wchar_t));      if( !wc_textbuf ) {	fprintf(stderr, 		"error: not enough memory for wide character conversion "		"(%ld bytes)\n",		(long int)(wc_textbuf_len * sizeof(wchar_t)));	exit(0);      }    }    /* convert as much as we can of the line into wide characters */    s = textbuf;    k = textbuf_len;    wp = wc_textbuf;    wclen = 0;    /* since we ensured textbuf_len <= wctextbuf_len       there will never be overflow of wctextbuf below */    while( k > 0 ) {      l = mbrtowc(wp, s, k, &input_shiftstate);      if( l > 0 ) {	wp++;	wclen++;	k -= l;	s += l;      } else if( l == 0 ) {	break;      } else if( l == -1 ) {	/* try to be robust */	s++; 	k--;	memset(&input_shiftstate, 0, sizeof(mbstate_t));      } else if( l == -2) {	/* couldn't parse a complete character */	break;      }    }    *wp = L'\0';    /* now summarize this line if required */    if( line_fun ) { (*line_fun)(); }    /* the scores are written by dbacl, so there's no need for the conversion */    if( (textbuf[0] == 's') && 	(strncmp(MAGIC, textbuf, 7) == 0) ) {      if( !parse_dbacl_scores(textbuf) ) {	fprintf(stderr, "error: scores don't match risk specification\n");	exit(0);      } else if( options & (1<<OPTION_DEBUG) ) {	for(i = 0; i < spec.num_cats; i++) {	  fprintf(stdout, 		  "category %s\t cross_entropy %7.2f complexity %7.0f\n",		  spec.catname[i], spec.cross_entropy[i], spec.complexity[i]);	}	fprintf(stdout, "\n");      }    } else {      /* for each regex in our list, try for a match */#if defined HAVE_LIBBOOST_REGEX      /* boost regexes need a wide char string */            for( r = spec.regs; r != 0; r = r->next) {	if( regexec(&(r->reg), wc_textbuf, MAX_SUBMATCH, pmatch, 0) == 0 ) {	  r->lv->found = 1;	  /* convert each submatch to a number - pad remaining	     elements to zero */	  for(z = 1; z < MAX_SUBMATCH; z++) {	    if(pmatch[z].rm_so > -1) {	      r->lv->sm[z-1] = wcstod(wc_textbuf + pmatch[z].rm_so, NULL);	    } else {	      r->lv->sm[z-1] = 0.0;	    }	  }	  if( options & (1<<OPTION_DEBUG) ) {	    fprintf(stdout, 		    "match \"%s\"", r->lv->re);	    for(z = 1; (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) {	      fprintf(stdout,		      " %f", r->lv->sm[z-1]);	    }	    fprintf(stdout, "\n");	  }	}      }#else      /* GNU regexes expect an ordinary string */      for( r = spec.regs; r != 0; r = r->next) {	if( regexec(&(r->reg), textbuf, MAX_SUBMATCH, pmatch, 0) == 0 ) {	  r->lv->found = 1;	  /* convert each submatch to a number - pad remaining	     elements to zero */	  for(z = 1; z < MAX_SUBMATCH; z++) {	    if(pmatch[z].rm_so > -1) {	      r->lv->sm[z-1] = strtod(textbuf + pmatch[z].rm_so, NULL);	    } else {	      r->lv->sm[z-1] = 0.0;	    }	  }	  if( options & (1<<OPTION_DEBUG) ) {	    fprintf(stdout, 		    "match \"%s\"", r->lv->re);	    for(z = 1; (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) {	      fprintf(stdout,		      " %f", r->lv->sm[z-1]);	    }	    fprintf(stdout, "\n");	  }	}      }#endif    }  }}#endif /* HAVE_WCHAR_H *//*********************************************************** * MAIN FUNCTIONS                                          * ***********************************************************/char *sanitize_path(char *in) {  char *q;  char *path;  charbuf_len_t l;  /* this bit likely fails in DOS ;-) */  if( (*in != '/') && (*in != '.') && (path = getenv("DBACL_PATH")) ) {    l = strlen(path);    q = malloc(l + strlen(in) + 3);    strcpy(q, path);    if( q[l - 1] != '/' ) {      q[l] = '/';       q[l + 1] = 0;    }    strcat(q, in);    return q;  } else {    return in;  }}int main(int argc, char **argv) {  FILE *input;  char op;  void (*preprocess_fun)(void) = NULL;  void (*line_fun)(void) = NULL;  void (*postprocess_fun)(void) = NULL;  /* set up internationalization */  if( !setlocale(LC_ALL, "") ) {    fprintf(stderr, 	    "warning: could not set locale, internationalization disabled\n");  } else {    if( options & (1<<OPTION_DEBUG) ) {      fprintf(stderr, 	      "warning: international locales not supported\n");    }  }  /* parse the options */  while( (op = getopt(argc, argv, "DVvinc:")) > -1 ) {    switch(op) {    case 'V':      fprintf(stdout, "bayesol version %s\n", VERSION);#if defined HAVE_LIBBOOST_REGEX      fprintf(stdout, "Using BOOST wide character modern regexes.\n");#elif defined __GNUC__      fprintf(stdout, "Using GNU modern regexes.\n");#else      fprintf(stdout, "Using system regexes.\n");#endif      exit(1);      break;    case 'n':      options |= (1<<OPTION_SCORES);      break;    case 'i':      options |= (1<<OPTION_I18N);#if !defined HAVE_WCHAR_H || !defined HAVE_WCTYPE_H      fprintf(stderr, "warning: this tool has been compiled without wide character support. Full internationalization is disabled.\n");      options &= ~(1<<OPTION_I18N);#endif      break;    case 'c':      if( *optarg && read_riskspec(optarg) ) {	options |= (1<<OPTION_RISKSPEC);      } else {	fprintf(stderr, "error: could not read %s, program aborted\n", optarg); 	exit(0);      }      break;    case 'v':      options |= (1<<OPTION_VERBOSE);      break;    case 'D':      options |= (1<<OPTION_DEBUG);      break;    default:      break;    }  }  /* end option processing */      /* consistency checks */  if( !(options & (1<<OPTION_RISKSPEC)) ){    fprintf(stderr, 	    "error: please use -c option\n");    usage(argv);    exit(0);  }  if( options & (1<<OPTION_I18N) ) {#if defined HAVE_LIBBOOST_REGEX#else    fprintf(stderr, 	    "warning: regexes operate in multibyte encoding.");#endif	  }  /* set up callbacks */  if( options & (1<<OPTION_RISKSPEC) ) {    preprocess_fun = setup_regexes;    line_fun = NULL; /* print_line; */    postprocess_fun = finish_parsing_and_score;  } else { /* something wrong ? */    usage(argv);    exit(0);  }  if( preprocess_fun ) { (*preprocess_fun)(); }  /* preallocate primary text holding buffer */  textbuf_len = BUFLEN;  textbuf = malloc(textbuf_len);  /* now process each file on the command line,     or if none provided read stdin */  while( (optind > -1) && *(argv + optind) ) {    /* if it's a filename, process it */    if( (input = fopen(argv[optind], "r")) ) {      options |= (1<<INPUT_FROM_CMDLINE);      if( options & (1<<OPTION_DEBUG) ) {	fprintf(stdout, "processing file %s\n", argv[optind]);      }      if( !(options & (1<<OPTION_I18N)) ) {	process_file(input, line_fun);      } else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H	w_process_file(input, line_fun);#endif      }      fclose(input);    } else { /* unrecognized file name */      fprintf(stderr, "error: couldn't open %s\n", argv[optind]);      usage(argv);      exit(0);    }    optind++;  }  /* in case no files were specified, get input from stdin */  if( !(options & (1<<INPUT_FROM_CMDLINE)) ) {    if( options & (1<<OPTION_DEBUG) ) {      fprintf(stdout, "taking input from stdin\n");    }    if( !(options & (1<<OPTION_I18N)) ) {      process_file(stdin, line_fun);    } else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H      w_process_file(stdin, line_fun);#endif    }  }    if( postprocess_fun ) { (*postprocess_fun)(); }  /* free some global resources */  free(textbuf);  exit(exit_code);}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -