📄 dbacl.c
字号:
#if defined HAVE_LIBBOOST_REGEX fprintf(stdout, "Using BOOST wide character modern regexes.\n");#elif defined __GNUC__ fprintf(stdout, "Using GNU modern regexes.\n");#else fprintf(stdout, "Using system regexes.\n");#endif fprintf(stdout, "Feature memory requirements: %d bytes (classifying), %d bytes (learning)\n", (int)sizeof(c_item), (int)sizeof(l_item)); exit(1); break; case 'd': options |= (1<<OPTION_DUMP); break; case 'h': /* select memory size in powers of 2 */ default_max_hash_bits = strtod(optarg, NULL); if( default_max_hash_bits > MAX_HASH_BITS ) { fprintf(stderr, "warning: maximum hash size will be 2^%d\n", MAX_HASH_BITS); default_max_hash_bits = MAX_HASH_BITS; } default_max_tokens = (1<<default_max_hash_bits); break; case 'H': /* select memory size in powers of 2 */ default_max_grow_hash_bits = strtod(optarg, NULL); if( default_max_grow_hash_bits > MAX_HASH_BITS ) { fprintf(stderr, "warning: maximum hash size will be 2^%d\n", MAX_HASH_BITS); default_max_grow_hash_bits = MAX_HASH_BITS; } default_max_grow_tokens = (1<<default_max_grow_hash_bits); options |= (1<<OPTION_GROWHASH); break; case 'j': options |= (1<<OPTION_CASEN); break; case 'n': options |= (1<<OPTION_SCORES); break; case 'c': if( cat_count >= MAX_CAT ) { fprintf(stderr, "warning: maximum reached, category ignored\n"); } else if( options & (1<<OPTION_LEARN) ) { fprintf(stderr, "error: cannot use options -l and -c together\n"); usage(argv); exit(0); } else { options |= (1<<OPTION_CLASSIFY); cat[cat_count].filename = sanitize_path(optarg); if( !*optarg ) { fprintf(stderr, "error: category needs a name\n"); usage(argv); exit(0); } if( !load_category(&cat[cat_count]) ) { fprintf(stderr, "error: could not load category %s\n", cat[cat_count].filename); exit(0); } ngram_order = (ngram_order < cat[cat_count].max_order) ? cat[cat_count].max_order : ngram_order; cat_count++; } break; case 'r': options |= (1<<OPTION_REFMODEL); break; case 'w': ngram_order = atoi(optarg); if( !*optarg || (ngram_order < 1) || (ngram_order > 9) ) { fprintf(stderr, "error: the -w switch needs a number between 1 and 9\n"); exit(0); } break; case 'x': decimation = atoi(optarg); if( !*optarg || (decimation < 1) || (decimation > MAX_HASH_BITS) ) { fprintf(stderr, "warning: option -x ignored, needs an integer between 1 and %d\n", MAX_HASH_BITS); } else { options |= (1<<OPTION_DECIMATE); } break; case 'f': if( filter_count >= MAX_CAT ) { fprintf(stderr, "warning: maximum reached, filter ignored\n"); } else if( options & (1<<OPTION_LEARN) ) { fprintf(stderr, "error: cannot use options -l and -f together\n"); usage(argv); exit(0); } else if( !*optarg ) { fprintf(stderr, "error: filter must be category name or number\n"); usage(argv); exit(0); } else { options |= (1<<OPTION_FILTER); filter[filter_count] = -1; /* see if it's a known category */ for(c = 0; c < cat_count; c++) { if( !strcmp(cat[c].filename, optarg) ) { filter[filter_count] = c; break; } } /* if not recognized, see if it's a number */ if( filter[filter_count] < 0 ) { filter[filter_count] = strtod(optarg, NULL) - 1; } if( filter[filter_count] < 0 ) { /* still not recognized */ fprintf(stderr, "error: unrecognized category in -f option [%s]\n", optarg); usage(argv); exit(0); } filter_count++; } break; case 'v': options |= (1<<OPTION_VERBOSE); break; case 'l': if( options & (1<<OPTION_CLASSIFY) ) { fprintf(stderr, "error: cannot use options -l and -c together\n"); usage(argv); exit(0); } else if( options & (1<<OPTION_LEARN) ) { fprintf(stderr, "error: option -l can only occur once\n"); exit(0); } else { options |= (1<<OPTION_LEARN); learner.filename = sanitize_path(optarg); if( !*learner.filename ) { fprintf(stderr, "error: category needs a name\n"); usage(argv); exit(0); } } break; case 'g': if( options & (1<<OPTION_CLASSIFY) ) { fprintf(stderr, "error: must use option -l together with -F\n"); usage(argv); exit(0); } else { /* set up the submatch bitmap */ re[regex_count].submatches |= 0; if( (p = strrchr(optarg, '|')) && ( *(--p) == '|') ) { /* assume string ents in ||12345, use as bitmap */ *p = '\0'; for(p += 2; *p; p++) { /* assume ascii number positions */ if( !isdigit(*p) || (*p > '9') || (*p < '1')) { fprintf(stderr, "warning: could not decode bitmap for %s\n", optarg); } else { re[regex_count].submatches |= (1<<(*p - '0')); } } } else { /* no bitmap specified */ re[regex_count].submatches = ~0; }#if defined HAVE_LIBBOOST_REGEX /* BOOST regexes must be wide */ if( !(rstring = malloc((strlen(optarg) + 1) * sizeof(wchar_t))) ) { fprintf(stderr, "warning: could not prepare regular expression '%s', ignored\n", optarg); } else if( mbstowcs(rstring, optarg, strlen(optarg)) < 0 ) { fprintf(stderr, "warning: could not convert regular expression '%s', ignored\n", optarg); } else { rstring[strlen(optarg)] = L'\0'; if( regcomp(&re[regex_count].regex, rstring, REG_EXTENDED) != 0 ) { fprintf(stderr, "warning: could not compile regular expression '%s', ignored\n", optarg); } else { re[regex_count].string = rstring; regex_count++; } }#else /* GNU regexes use regular strings */ if( regcomp(&re[regex_count].regex, optarg, REG_EXTENDED) != 0 ) { fprintf(stderr, "warning: could not compile regular expression '%s', ignored\n", optarg); } else { re[regex_count].string = optarg; regex_count++; }#endif } break; case 'i': options |= (1<<OPTION_I18N);#if defined HAVE_LANGINFO_H if( !strcmp(nl_langinfo(CODESET), "UTF-8") ) { fprintf(stderr, "warning: you have UTF-8, so -i is not needed.\n"); }#endif#if !defined HAVE_WCHAR_H || !defined HAVE_WCTYPE_H fprintf(stderr, "warning: this tool was compiled without wide character support. Full internationalization is disabled.\n"); options &= ~(1<<OPTION_I18N);#endif break; default: break; } } /* end option processing */ /* consistency checks */ if( ((options>>OPTION_CLASSIFY) & 1) + ((options>>OPTION_LEARN) & 1) != 1 ) { fprintf(stderr, "error: please use either -c or -l option\n"); usage(argv); exit(0); } if( (options & (1<<OPTION_DECIMATE)) && !(options & (1<<OPTION_LEARN)) ) { fprintf(stderr, "warning: option -x ignored, applies only when learning\n"); options &= ~(1<<OPTION_DECIMATE); } if( options & (1<<OPTION_DUMP) ) { if( options & (1<<OPTION_CLASSIFY) ) { options &= ~(1<<OPTION_DUMP); /* no sense */ } else if( options & (1<<OPTION_VERBOSE) ) { options &= ~(1<<OPTION_VERBOSE); /* verbose writes garbage to stdout */ options &= ~(1<<OPTION_DEBUG); } } if( ((options>>OPTION_TEXT_FORMAT) & 1) + ((options>>OPTION_MBOX_FORMAT) & 1) > 1 ) { fprintf(stderr, "error: please use one of either -T text or -T email options\n"); usage(argv); exit(0); } if( (options & (1<<OPTION_APPEND)) && (options & (1<<OPTION_FILTER)) ) { options &= ~(1<<OPTION_APPEND); fprintf(stderr, "warning: disabling option -a, because it cannot be used with -f\n"); } /* decide if we need some options */ if( !regex_count ) { options |= (1<<OPTION_NOREGEX); } if( options & (1<<OPTION_MULTINOMIAL) ) { if( cat_count == 1 ) { if( cat[0].model_type == simple ) { options |= (1<<OPTION_CALCENTROPY); } } else if( cat_count > 1 ) { for(c = 1; c < cat_count; c++) { if( cat[c].model_type == sequential ) { break; } if( cat[c].retype != cat[c-1].retype ) { break; } } if( c == cat_count ) { options |= (1<<OPTION_CALCENTROPY); } } if( !(options & (1<<OPTION_CALCENTROPY)) ) { fprintf(stderr, "warning: -M switch ignored. " "Not all categories support multinomial calculations\n"); } } if( options & (1<<OPTION_I18N) ) {#if defined HAVE_LIBBOOST_REGEX#else fprintf(stderr, "warning: regexes operate in multibyte encoding.\n");#endif } /* set up callbacks */ if( options & (1<<OPTION_CLASSIFY) ) { if( options & (1<<OPTION_CALCENTROPY) ) { init_empirical(&empirical, default_max_tokens, default_max_hash_bits); /* sets cached to zero */ } if( cat_count == 1 ) { /* single category */ preprocess_fun = NULL; word_fun = score_word; if( options & (1<<OPTION_FILTER) ) { options |= (1<<OPTION_FASTEMP); empirical.track_features = 1; post_line_fun = line_score_single_category; postprocess_fun = NULL; } else { post_line_fun = NULL; postprocess_fun = score_single_category; } } else { /* multiple categories */ preprocess_fun = NULL; word_fun = score_word; if( options & (1<<OPTION_FILTER) ) { options |= (1<<OPTION_FASTEMP); empirical.track_features = 1; post_line_fun = line_score_multiple_categories; postprocess_fun = NULL; } else { post_line_fun = NULL; postprocess_fun = score_multiple_categories; } } } else if( options & (1<<OPTION_LEARN) ) { /* category learning */ preprocess_fun = init_learner; word_fun = hash_word_and_learn; if( options & (1<<OPTION_MBOX_FORMAT) ) { post_line_fun = count_mbox_messages; } else { post_line_fun = NULL; } postprocess_fun = optimize_learner_and_save; } else { /* something wrong ? */ usage(argv); exit(0); } /* handles some common filtering options */ if( (options & (1<<OPTION_INDENTED)) || (options & (1<<OPTION_APPEND)) ) { pre_line_fun = handle_indents_and_appends; } if( preprocess_fun ) { (*preprocess_fun)(); } init_file_handling(); /* now process each file on the command line, or if none provided read stdin */ while( (optind > -1) && *(argv + optind) ) { /* if it's a filename, process it */ if( (input = fopen(argv[optind], "r")) ) { options |= (1<<INPUT_FROM_CMDLINE); if( (options & (1<<OPTION_VERBOSE)) && !(options & (1<<OPTION_CLASSIFY))) { fprintf(stdout, "processing file %s\n", argv[optind]); } /* set some initial options */ reset_xml_character_filter(); if( options & (1<<OPTION_MBOX_FORMAT) ) { reset_mbox_line_filter(); if( !(options & (1<<OPTION_I18N)) ) { process_file(input, mbox_line_filter, ((options & (1<<OPTION_XML)) ? xml_character_filter : NULL), word_fun, pre_line_fun, post_line_fun); } else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H w_process_file(input, w_mbox_line_filter, ((options & (1<<OPTION_XML)) ? w_xml_character_filter : NULL), word_fun, pre_line_fun, post_line_fun);#endif } } else { /* default to text */ if( !(options & (1<<OPTION_I18N)) ) { process_file(input, NULL, ((options & (1<<OPTION_XML)) ? xml_character_filter : NULL), word_fun, pre_line_fun, post_line_fun); } else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H w_process_file(input, NULL, ((options & (1<<OPTION_XML)) ? w_xml_character_filter : NULL), word_fun, pre_line_fun, post_line_fun);#endif } } fclose(input); } else { /* unrecognized file name */ fprintf(stderr, "error: couldn't open %s\n", argv[optind]); usage(argv); exit(0); } optind++; } /* in case no files were specified, get input from stdin */ if( !(options & (1<<INPUT_FROM_CMDLINE)) ) { if( (options & (1<<OPTION_VERBOSE)) && !(options & (1<<OPTION_CLASSIFY)) ) { fprintf(stdout, "taking input from stdin\n"); } /* set some initial options */ reset_xml_character_filter(); if( options & (1<<OPTION_MBOX_FORMAT) ) { reset_mbox_line_filter(); if( !(options & (1<<OPTION_I18N)) ) { process_file(stdin, mbox_line_filter, ((options & (1<<OPTION_XML)) ? xml_character_filter : NULL), word_fun, pre_line_fun, post_line_fun); } else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H w_process_file(stdin, w_mbox_line_filter, ((options & (1<<OPTION_XML)) ? w_xml_character_filter : NULL), word_fun, pre_line_fun, post_line_fun);#endif } } else { /* default to text */ if( !(options & (1<<OPTION_I18N)) ) { process_file(stdin, NULL, ((options & (1<<OPTION_XML)) ? xml_character_filter : NULL), word_fun, pre_line_fun, post_line_fun); } else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H w_process_file(stdin, NULL, ((options & (1<<OPTION_XML)) ? w_xml_character_filter : NULL), word_fun, pre_line_fun, post_line_fun);#endif } } } if( postprocess_fun ) { (*postprocess_fun)(); } cleanup_file_handling(); for(k = 0; k < regex_count; k++) { regfree(&re[k].regex); } exit(exit_code);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -