📄 bayesol.c
字号:
/* reads a text file as input, and applies several filters. */void process_file(FILE *input, void (*line_fun)(void)) { char *s; category_count_t i; charbuf_len_t k; charbuf_len_t l; RegMatch *r; regmatch_t pmatch[MAX_SUBMATCH]; submatch_order_t z;#if defined HAVE_LIBBOOST_REGEX charbuf_len_t rr;#endif /* now start processing */ while( !feof(input) ) { /* read in a full line, allocating memory as necessary */ textbuf[0] = '\0'; s = textbuf; l = textbuf_len; k = 1; while( fgets(s, l, input) && (strlen(s) >= (l - 1)) ) { textbuf = realloc(textbuf, 2 * textbuf_len); if( !textbuf ) { fprintf(stderr, "error: not enough memory for input line (%d bytes)\n", textbuf_len); exit(0); } s = textbuf + textbuf_len - (k++); l = textbuf_len; textbuf_len *= 2; } /* now summarize this line if required */ if( line_fun ) { (*line_fun)(); } if( (textbuf[0] == 's') && (strncmp(MAGIC, textbuf, 7) == 0) ) { if( !parse_dbacl_scores(textbuf) ) { fprintf(stderr, "error: scores don't match risk specification\n"); exit(0); } else if( options & (1<<OPTION_DEBUG) ) { for(i = 0; i < spec.num_cats; i++) { fprintf(stdout, "category %s\t cross_entropy %7.2f complexity %7.0f\n", spec.catname[i], spec.cross_entropy[i], spec.complexity[i]); } fprintf(stdout, "\n"); } } else { /* for each regex in our list, try for a match */#if defined HAVE_LIBBOOST_REGEX /* boost regexes need a wide char string */ /* now convert the line into a wide character string */ if( textbuf_len > wc_textbuf_len ) { wc_textbuf_len = textbuf_len; wc_textbuf = realloc(wc_textbuf, wc_textbuf_len * sizeof(wchar_t)); if( !wc_textbuf ) { fprintf(stderr, "error: not enough memory for wide character conversion " "(%d bytes)\n", wc_textbuf_len * sizeof(wchar_t)); exit(0); } } rr = mbstowcs(wc_textbuf, textbuf, wc_textbuf_len); if( rr <= 0 && (strlen(textbuf) > 0) ) { fprintf(stderr, "error: couldn't convert an input string for matching, ignoring regexes\n"); } else { wc_textbuf[rr] = L'\0'; for( r = spec.regs; r != 0; r = r->next) { if( regexec(&(r->reg), wc_textbuf, MAX_SUBMATCH, pmatch, 0) == 0 ) { r->lv->found = 1; /* convert each submatch to a number - pad remaining elements to zero */ for(z = 1; z < MAX_SUBMATCH; z++) { if(pmatch[z].rm_so > -1) { r->lv->sm[z-1] = wcstod(wc_textbuf + pmatch[z].rm_so, NULL); } else { r->lv->sm[z-1] = 0.0; } } if( options & (1<<OPTION_DEBUG) ) { fprintf(stdout, "match \"%s\"", r->lv->re); for(z = 1; (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) { fprintf(stdout, " %f", r->lv->sm[z-1]); } fprintf(stdout, "\n"); } } } }#else /* GNU regexes expect an ordinary string */ for( r = spec.regs; r != 0; r = r->next) { if( regexec(&(r->reg), textbuf, MAX_SUBMATCH, pmatch, 0) == 0 ) { r->lv->found = 1; /* convert each submatch to a number - pad remaining elements to zero */ for(z = 1; z < MAX_SUBMATCH; z++) { if(pmatch[z].rm_so > -1) { r->lv->sm[z-1] = strtod(textbuf + pmatch[z].rm_so, NULL); } else { r->lv->sm[z-1] = 0.0; } } if( options & (1<<OPTION_DEBUG) ) { fprintf(stdout, "match \"%s\"", r->lv->re); for(z = 1; (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) { fprintf(stdout, " %f", r->lv->sm[z-1]); } fprintf(stdout, "\n"); } } }#endif } }}/*********************************************************** * WIDE CHARACTER FILE HANDLING FUNCTIONS * * this is needed for any locale whose character set * * encoding can include NUL bytes inside characters * * * * Actually, at present this is quite useless. But it might* * prove handy in the future. * ***********************************************************/#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H/* reads a text file as input, converting each line into a wide character representation and applies several filters. */void w_process_file(FILE *input, void (*line_fun)(void)) { char *s; charbuf_len_t k; charbuf_len_t l; charbuf_len_t wclen; wchar_t *wp; mbstate_t input_shiftstate; category_count_t i; RegMatch *r; regmatch_t pmatch[MAX_SUBMATCH]; submatch_order_t z; memset(&input_shiftstate, 0, sizeof(mbstate_t)); while( !feof(input) ) { /* read in a full line, allocating memory as necessary */ textbuf[0] = '\0'; s = textbuf; l = textbuf_len; k = 1; while( fgets(s, l, input) && (strlen(s) >= (l - 1)) ) { textbuf = realloc(textbuf, 2 * textbuf_len); if( !textbuf ) { fprintf(stderr, "error: not enough memory for input line (%d bytes)\n", textbuf_len); exit(0); } s = textbuf + textbuf_len - (k++); l = textbuf_len; textbuf_len *= 2; } /* now convert the line into a wide character string */ if( textbuf_len > wc_textbuf_len ) { wc_textbuf_len = textbuf_len; wc_textbuf = realloc(wc_textbuf, wc_textbuf_len * sizeof(wchar_t)); if( !wc_textbuf ) { fprintf(stderr, "error: not enough memory for wide character conversion " "(%ld bytes)\n", (long int)(wc_textbuf_len * sizeof(wchar_t))); exit(0); } } /* convert as much as we can of the line into wide characters */ s = textbuf; k = textbuf_len; wp = wc_textbuf; wclen = 0; /* since we ensured textbuf_len <= wctextbuf_len there will never be overflow of wctextbuf below */ while( k > 0 ) { l = mbrtowc(wp, s, k, &input_shiftstate); if( l > 0 ) { wp++; wclen++; k -= l; s += l; } else if( l == 0 ) { break; } else if( l == -1 ) { /* try to be robust */ s++; k--; memset(&input_shiftstate, 0, sizeof(mbstate_t)); } else if( l == -2) { /* couldn't parse a complete character */ break; } } *wp = L'\0'; /* now summarize this line if required */ if( line_fun ) { (*line_fun)(); } /* the scores are written by dbacl, so there's no need for the conversion */ if( (textbuf[0] == 's') && (strncmp(MAGIC, textbuf, 7) == 0) ) { if( !parse_dbacl_scores(textbuf) ) { fprintf(stderr, "error: scores don't match risk specification\n"); exit(0); } else if( options & (1<<OPTION_DEBUG) ) { for(i = 0; i < spec.num_cats; i++) { fprintf(stdout, "category %s\t cross_entropy %7.2f complexity %7.0f\n", spec.catname[i], spec.cross_entropy[i], spec.complexity[i]); } fprintf(stdout, "\n"); } } else { /* for each regex in our list, try for a match */#if defined HAVE_LIBBOOST_REGEX /* boost regexes need a wide char string */ for( r = spec.regs; r != 0; r = r->next) { if( regexec(&(r->reg), wc_textbuf, MAX_SUBMATCH, pmatch, 0) == 0 ) { r->lv->found = 1; /* convert each submatch to a number - pad remaining elements to zero */ for(z = 1; z < MAX_SUBMATCH; z++) { if(pmatch[z].rm_so > -1) { r->lv->sm[z-1] = wcstod(wc_textbuf + pmatch[z].rm_so, NULL); } else { r->lv->sm[z-1] = 0.0; } } if( options & (1<<OPTION_DEBUG) ) { fprintf(stdout, "match \"%s\"", r->lv->re); for(z = 1; (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) { fprintf(stdout, " %f", r->lv->sm[z-1]); } fprintf(stdout, "\n"); } } }#else /* GNU regexes expect an ordinary string */ for( r = spec.regs; r != 0; r = r->next) { if( regexec(&(r->reg), textbuf, MAX_SUBMATCH, pmatch, 0) == 0 ) { r->lv->found = 1; /* convert each submatch to a number - pad remaining elements to zero */ for(z = 1; z < MAX_SUBMATCH; z++) { if(pmatch[z].rm_so > -1) { r->lv->sm[z-1] = strtod(textbuf + pmatch[z].rm_so, NULL); } else { r->lv->sm[z-1] = 0.0; } } if( options & (1<<OPTION_DEBUG) ) { fprintf(stdout, "match \"%s\"", r->lv->re); for(z = 1; (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) { fprintf(stdout, " %f", r->lv->sm[z-1]); } fprintf(stdout, "\n"); } } }#endif } }}#endif /* HAVE_WCHAR_H *//*********************************************************** * MAIN FUNCTIONS * ***********************************************************/char *sanitize_path(char *in) { char *q; char *path; charbuf_len_t l; /* this bit likely fails in DOS ;-) */ if( (*in != '/') && (*in != '.') && (path = getenv("DBACL_PATH")) ) { l = strlen(path); q = malloc(l + strlen(in) + 3); strcpy(q, path); if( q[l - 1] != '/' ) { q[l] = '/'; q[l + 1] = 0; } strcat(q, in); return q; } else { return in; }}int main(int argc, char **argv) { FILE *input; char op; void (*preprocess_fun)(void) = NULL; void (*line_fun)(void) = NULL; void (*postprocess_fun)(void) = NULL; /* set up internationalization */ if( !setlocale(LC_ALL, "") ) { fprintf(stderr, "warning: could not set locale, internationalization disabled\n"); } else { if( options & (1<<OPTION_DEBUG) ) { fprintf(stderr, "warning: international locales not supported\n"); } } /* parse the options */ while( (op = getopt(argc, argv, "DVvinc:")) > -1 ) { switch(op) { case 'V': fprintf(stdout, "bayesol version %s\n", VERSION);#if defined HAVE_LIBBOOST_REGEX fprintf(stdout, "Using BOOST wide character modern regexes.\n");#elif defined __GNUC__ fprintf(stdout, "Using GNU modern regexes.\n");#else fprintf(stdout, "Using system regexes.\n");#endif exit(1); break; case 'n': options |= (1<<OPTION_SCORES); break; case 'i': options |= (1<<OPTION_I18N);#if !defined HAVE_WCHAR_H || !defined HAVE_WCTYPE_H fprintf(stderr, "warning: this tool has been compiled without wide character support. Full internationalization is disabled.\n"); options &= ~(1<<OPTION_I18N);#endif break; case 'c': if( *optarg && read_riskspec(optarg) ) { options |= (1<<OPTION_RISKSPEC); } else { fprintf(stderr, "error: could not read %s, program aborted\n", optarg); exit(0); } break; case 'v': options |= (1<<OPTION_VERBOSE); break; case 'D': options |= (1<<OPTION_DEBUG); break; default: break; } } /* end option processing */ /* consistency checks */ if( !(options & (1<<OPTION_RISKSPEC)) ){ fprintf(stderr, "error: please use -c option\n"); usage(argv); exit(0); } if( options & (1<<OPTION_I18N) ) {#if defined HAVE_LIBBOOST_REGEX#else fprintf(stderr, "warning: regexes operate in multibyte encoding.");#endif } /* set up callbacks */ if( options & (1<<OPTION_RISKSPEC) ) { preprocess_fun = setup_regexes; line_fun = NULL; /* print_line; */ postprocess_fun = finish_parsing_and_score; } else { /* something wrong ? */ usage(argv); exit(0); } if( preprocess_fun ) { (*preprocess_fun)(); } /* preallocate primary text holding buffer */ textbuf_len = BUFLEN; textbuf = malloc(textbuf_len); /* now process each file on the command line, or if none provided read stdin */ while( (optind > -1) && *(argv + optind) ) { /* if it's a filename, process it */ if( (input = fopen(argv[optind], "r")) ) { options |= (1<<INPUT_FROM_CMDLINE); if( options & (1<<OPTION_DEBUG) ) { fprintf(stdout, "processing file %s\n", argv[optind]); } if( !(options & (1<<OPTION_I18N)) ) { process_file(input, line_fun); } else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H w_process_file(input, line_fun);#endif } fclose(input); } else { /* unrecognized file name */ fprintf(stderr, "error: couldn't open %s\n", argv[optind]); usage(argv); exit(0); } optind++; } /* in case no files were specified, get input from stdin */ if( !(options & (1<<INPUT_FROM_CMDLINE)) ) { if( options & (1<<OPTION_DEBUG) ) { fprintf(stdout, "taking input from stdin\n"); } if( !(options & (1<<OPTION_I18N)) ) { process_file(stdin, line_fun); } else {#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H w_process_file(stdin, line_fun);#endif } } if( postprocess_fun ) { (*postprocess_fun)(); } /* free some global resources */ free(textbuf); exit(exit_code);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -