📄 dbacl.c

📁 dbacl 是一个通用的 digramic 贝叶斯文本分类器。它可以学习你提供的文本。
💻 C
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 3 4
#if defined HAVE_LIBBOOST_REGEX      fprintf(stdout, "Using BOOST wide character modern regexes.\n");#elif defined __GNUC__      fprintf(stdout, "Using GNU modern regexes.\n");#else      fprintf(stdout, "Using system regexes.\n");#endif      fprintf(stdout, "Feature memory requirements: %d bytes (classifying), %d bytes (learning)\n", 	      (int)sizeof(c_item), (int)sizeof(l_item));      exit(1);      break;    case 'd':      options |= (1<<OPTION_DUMP);      break;    case 'h': /* select memory size in powers of 2 */      default_max_hash_bits = strtod(optarg, NULL);      if( default_max_hash_bits > MAX_HASH_BITS ) {	fprintf(stderr, 		"warning: maximum hash size will be 2^%d\n", 		MAX_HASH_BITS);	default_max_hash_bits = MAX_HASH_BITS;      }      default_max_tokens = (1<<default_max_hash_bits);      break;    case 'H': /* select memory size in powers of 2 */      default_max_grow_hash_bits = strtod(optarg, NULL);      if( default_max_grow_hash_bits > MAX_HASH_BITS ) {	fprintf(stderr, 		"warning: maximum hash size will be 2^%d\n", 		MAX_HASH_BITS);	default_max_grow_hash_bits = MAX_HASH_BITS;      }      default_max_grow_tokens = (1<<default_max_grow_hash_bits);      options |= (1<<OPTION_GROWHASH);      break;    case 'j':      options |= (1<<OPTION_CASEN);      break;    case 'n':      options |= (1<<OPTION_SCORES);      break;    case 'c':      if( cat_count >= MAX_CAT ) {	fprintf(stderr, 		"warning: maximum reached, category ignored\n");      } else if( options & (1<<OPTION_LEARN) ) {	fprintf(stderr, 		"error: cannot use options -l and -c together\n");	usage(argv);	exit(0);      } else {	options |= (1<<OPTION_CLASSIFY);	cat[cat_count].filename = sanitize_path(optarg);	if( !*optarg ) {	  fprintf(stderr, "error: category needs a name\n");	  usage(argv);	  exit(0);	}	if( !load_category(&cat[cat_count]) ) {	  fprintf(stderr, 		  "error: could not load category %s\n", 		  cat[cat_count].filename);	  exit(0);	}	ngram_order = (ngram_order < cat[cat_count].max_order) ? 	
  cat[cat_count].max_order : ngram_order;	cat_count++;      }      break;    case 'r':      options |= (1<<OPTION_REFMODEL);      break;    case 'w':      ngram_order = atoi(optarg);      if( !*optarg || (ngram_order < 1) || (ngram_order > 9) ) {	fprintf(stderr, 		"error: the -w switch needs a number between 1 and 9\n");	exit(0);      }      break;    case 'x':      decimation = atoi(optarg);      if( !*optarg || (decimation < 1) || (decimation > MAX_HASH_BITS) ) {	fprintf(stderr, 		"warning: option -x ignored, needs an integer between 1 and %d\n", 		MAX_HASH_BITS);      } else {	options |= (1<<OPTION_DECIMATE);      }      break;    case 'f':      if( filter_count >= MAX_CAT ) {	fprintf(stderr, "warning: maximum reached, filter ignored\n");      } else if( options & (1<<OPTION_LEARN) ) {	fprintf(stderr, "error: cannot use options -l and -f together\n");	usage(argv);	exit(0);      } else if( !*optarg ) {	fprintf(stderr, "error: filter must be category name or number\n");	usage(argv);	exit(0);      } else {	options |= (1<<OPTION_FILTER);	filter[filter_count] = -1;	/* see if it's a known category */	for(c = 0; c < cat_count; c++) {	  if( !strcmp(cat[c].filename, optarg) ) {	    filter[filter_count] = c;	    break;	  }	}	/* if not recognized, see if it's a number */	if( filter[filter_count] < 0 ) {	  filter[filter_count] = strtod(optarg, NULL) - 1;	}	if( filter[filter_count] < 0 ) { /* still not recognized */	  fprintf(stderr, 		  "error: unrecognized category in -f option [%s]\n", 		  optarg);	  usage(argv);	  exit(0);	}	filter_count++;      }      break;    case 'v':      options |= (1<<OPTION_VERBOSE);      break;    case 'l':       if( options & (1<<OPTION_CLASSIFY) ) {	fprintf(stderr, 		"error: cannot use options -l and -c together\n");	usage(argv);	exit(0);      } else if( options & (1<<OPTION_LEARN) ) {	fprintf(stderr, 		"error: option -l can only occur once\n");	exit(0);      } else {	options |= (1<<OPTION_LEARN);	learner.filename = sanitize_path(optarg);	if( 
!*learner.filename ) {	  fprintf(stderr, 		  "error: category needs a name\n");	  usage(argv);	  exit(0);	}      }      break;    case 'g':      if( options & (1<<OPTION_CLASSIFY) ) {	fprintf(stderr, "error: must use option -l together with -F\n");	usage(argv);	exit(0);      } else {	/* set up the submatch bitmap */	re[regex_count].submatches |= 0;	if( (p = strrchr(optarg, '|')) && ( *(--p) == '|') ) {	  /* assume string ents in ||12345, use as bitmap */	  *p = '\0';	  for(p += 2; *p; p++) {	    /* assume ascii number positions */	    if( !isdigit(*p) || (*p > '9') || (*p < '1')) {	      fprintf(stderr, 		      "warning: could not decode bitmap for %s\n", optarg);	    } else {	      re[regex_count].submatches |= (1<<(*p - '0'));	    }	  }	} else { /* no bitmap specified */	  re[regex_count].submatches = ~0;	}#if defined HAVE_LIBBOOST_REGEX	/* BOOST regexes must be wide */	if( !(rstring = malloc((strlen(optarg) + 1) * sizeof(wchar_t))) ) {	  fprintf(stderr, 		  "warning: could not prepare regular expression '%s', ignored\n",		  optarg);	} else if( mbstowcs(rstring, optarg, strlen(optarg)) < 0 ) {	  fprintf(stderr,		  "warning: could not convert regular expression '%s', ignored\n",		  optarg);	} else {	  rstring[strlen(optarg)] = L'\0';	  if( regcomp(&re[regex_count].regex, rstring, REG_EXTENDED) != 0 ) {	    fprintf(stderr, 		    "warning: could not compile regular expression '%s', ignored\n", 		    optarg);	  } else {	    re[regex_count].string = rstring;	    regex_count++;	  }	}#else	/* GNU regexes use regular strings */	if( regcomp(&re[regex_count].regex, optarg, REG_EXTENDED) != 0 ) {	  fprintf(stderr, 		  "warning: could not compile regular expression '%s', ignored\n", 		  optarg);	} else {	  re[regex_count].string = optarg;	  regex_count++;	}#endif      }      break;    case 'i':      options |= (1<<OPTION_I18N);#if defined HAVE_LANGINFO_H      if( !strcmp(nl_langinfo(CODESET), "UTF-8") ) {	fprintf(stderr, "warning: you have UTF-8, so -i is not needed.\n");    
  }#endif#if !defined HAVE_WCHAR_H || !defined HAVE_WCTYPE_H      fprintf(stderr, "warning: this tool was compiled without wide character support. Full internationalization is disabled.\n");      options &= ~(1<<OPTION_I18N);#endif      break;    default:      break;    }  }  /* end option processing */      /* consistency checks */  if( ((options>>OPTION_CLASSIFY) & 1) +       ((options>>OPTION_LEARN) & 1) != 1 ) {    fprintf(stderr, 	    "error: please use either -c or -l option\n");    usage(argv);    exit(0);  }  if( (options & (1<<OPTION_DECIMATE)) &&      !(options & (1<<OPTION_LEARN)) ) {    fprintf(stderr,	    "warning: option -x ignored, applies only when learning\n");    options &= ~(1<<OPTION_DECIMATE);  }  if( options & (1<<OPTION_DUMP) ) {    if( options & (1<<OPTION_CLASSIFY) ) {      options &= ~(1<<OPTION_DUMP); /* no sense */    } else if( options & (1<<OPTION_VERBOSE) ) {      options &= ~(1<<OPTION_VERBOSE); /* verbose writes garbage to stdout */      options &= ~(1<<OPTION_DEBUG);    }  }  if( ((options>>OPTION_TEXT_FORMAT) & 1) +       ((options>>OPTION_MBOX_FORMAT) & 1) > 1 ) {    fprintf(stderr, 	    "error: please use one of either -T text or -T email options\n");    usage(argv);    exit(0);  }  if( (options & (1<<OPTION_APPEND)) &&      (options & (1<<OPTION_FILTER)) ) {    options &= ~(1<<OPTION_APPEND);    fprintf(stderr, 	    "warning: disabling option -a, because it cannot be used with -f\n");  }  /* decide if we need some options */  if( !regex_count ) {    options |= (1<<OPTION_NOREGEX);  }  if( options & (1<<OPTION_MULTINOMIAL) ) {     if( cat_count == 1 ) {      if( cat[0].model_type == simple ) {	options |= (1<<OPTION_CALCENTROPY);      }    } else if( cat_count > 1 ) {      for(c = 1; c < cat_count; c++) {	if( cat[c].model_type == sequential ) { break; }	if( cat[c].retype != cat[c-1].retype ) { break; }      }      if( c == cat_count ) {	options |= (1<<OPTION_CALCENTROPY);      }    }    if( !(options & (1<<OPTION_CALCENTROPY)) ) 
{      fprintf(stderr,	      "warning: -M switch ignored. "	      "Not all categories support multinomial calculations\n");    }  }  if( options & (1<<OPTION_I18N) ) {#if defined HAVE_LIBBOOST_REGEX#else    fprintf(stderr, 	    "warning: regexes operate in multibyte encoding.\n");#endif	  }  /* set up callbacks */  if( options & (1<<OPTION_CLASSIFY) ) {    if( options & (1<<OPTION_CALCENTROPY) ) {      init_empirical(&empirical, 		     default_max_tokens, 		     default_max_hash_bits); /* sets cached to zero */    }    if( cat_count == 1 ) {      /* single category */      preprocess_fun = NULL;      word_fun = score_word;      if( options & (1<<OPTION_FILTER) ) {	options |= (1<<OPTION_FASTEMP);	empirical.track_features = 1; 	post_line_fun = line_score_single_category;	postprocess_fun = NULL;      } else {	post_line_fun = NULL;	postprocess_fun = score_single_category;      }    } else {      /* multiple categories */      preprocess_fun = NULL;      word_fun = score_word;      if( options & (1<<OPTION_FILTER) ) {	options |= (1<<OPTION_FASTEMP);	empirical.track_features = 1; 	post_line_fun = line_score_multiple_categories;	postprocess_fun = NULL;      } else {	post_line_fun = NULL;	postprocess_fun = score_multiple_categories;      }    }  } else if( options & (1<<OPTION_LEARN) ) {    /* category learning */    preprocess_fun = init_learner;    word_fun = hash_word_and_learn;    if( options & (1<<OPTION_MBOX_FORMAT) ) {      post_line_fun = count_mbox_messages;    } else {      post_line_fun = NULL;    }    postprocess_fun = optimize_learner_and_save;  } else { /* something wrong ? 
*/    usage(argv);    exit(0);  }  /* handles some common filtering options */  if( (options & (1<<OPTION_INDENTED)) ||      (options & (1<<OPTION_APPEND)) ) {    pre_line_fun = handle_indents_and_appends;  }  if( preprocess_fun ) { (*preprocess_fun)(); }  init_file_handling();  /* now process each file on the command line,     or if none provided read stdin */  while( (optind > -1) && *(argv + optind) ) {    /* if it's a filename, process it */    if( (input = fopen(argv[optind], "r")) ) {      options |= (1<<INPUT_FROM_CMDLINE);      if( (options & (1<<OPTION_VERBOSE)) && 	  !(options & (1<<OPTION_CLASSIFY))) {	fprintf(stdout, "processing file %s\n", argv[optind]);      }      /* set some initial options */      reset_xml_character_filter();      if( options & (1<<OPTION_MBOX_FORMAT) ) {	reset_mbox_line_filter();	if( !(options & (1<<OPTION_I18N)) ) {	  process_file(input, mbox_line_filter, 		       ((options & (1<<OPTION_XML)) ? xml_character_filter : NULL), 		       word_fun, pre_line_fun, post_line_fun);	} else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H	  w_process_file(input, w_mbox_line_filter, 			 ((options & (1<<OPTION_XML)) ? w_xml_character_filter : NULL), 			 word_fun, pre_line_fun, post_line_fun);#endif	}      } else { 	/* default to text */	if( !(options & (1<<OPTION_I18N)) ) {	  process_file(input, NULL, 		       ((options & (1<<OPTION_XML)) ? xml_character_filter : NULL), 		       word_fun, pre_line_fun, post_line_fun);	} else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H	  w_process_file(input, NULL, 			 ((options & (1<<OPTION_XML)) ? 
w_xml_character_filter : NULL), 			 word_fun, pre_line_fun, post_line_fun);#endif	}      }      fclose(input);    } else { /* unrecognized file name */      fprintf(stderr, "error: couldn't open %s\n", argv[optind]);      usage(argv);      exit(0);    }    optind++;  }  /* in case no files were specified, get input from stdin */  if( !(options & (1<<INPUT_FROM_CMDLINE)) ) {    if( (options & (1<<OPTION_VERBOSE)) && 	!(options & (1<<OPTION_CLASSIFY)) ) {      fprintf(stdout, "taking input from stdin\n");    }    /* set some initial options */    reset_xml_character_filter();    if( options & (1<<OPTION_MBOX_FORMAT) ) {      reset_mbox_line_filter();      if( !(options & (1<<OPTION_I18N)) ) {	process_file(stdin, mbox_line_filter, 		     ((options & (1<<OPTION_XML)) ? xml_character_filter : NULL), 		     word_fun, pre_line_fun, post_line_fun);      } else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H	w_process_file(stdin, w_mbox_line_filter, 		       ((options & (1<<OPTION_XML)) ? w_xml_character_filter : NULL), 		       word_fun, pre_line_fun, post_line_fun);#endif      }    } else {       /* default to text */      if( !(options & (1<<OPTION_I18N)) ) {	process_file(stdin, NULL,		     ((options & (1<<OPTION_XML)) ? xml_character_filter : NULL), 		     word_fun, pre_line_fun, post_line_fun);	      } else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H	w_process_file(stdin, NULL, 		       ((options & (1<<OPTION_XML)) ? w_xml_character_filter : NULL), 		       word_fun, pre_line_fun, post_line_fun);#endif      }    }  }    if( postprocess_fun ) { (*postprocess_fun)(); }  cleanup_file_handling();  for(k = 0; k < regex_count; k++) {    regfree(&re[k].regex);  }  exit(exit_code);}
上一页 1 2 3 4
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -