📄 fh.c

📁 dbacl是一个通用目的的digramic贝叶斯文本分类器。它可以学习你提供的文本
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*  * Copyright (C) 2002 Laird Breyer *   * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. *  * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. *  * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *  * Author:   Laird Breyer <laird@lbreyer.com> */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <ctype.h>#include <string.h>#include <stdlib.h>#include "dbacl.h"extern options_t options;extern Regex re[MAX_RE];extern regex_count_t regex_count;/* global variables */MBOX_State mbox;XML_State xml;char *textbuf = NULL;charbuf_len_t textbuf_len = 0;char *aux_textbuf = NULL;charbuf_len_t aux_textbuf_len = 0;#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_Hwchar_t *wc_textbuf = NULL;charbuf_len_t wc_textbuf_len = 0;#endiftoken_order_t ngram_order = 1; /* default */#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H/* compiler doesn't seem to know this function is in the  * library, so we define our own - bug or just plain weird? */int wcsncasecmp(const wchar_t *s1, const wchar_t *s2, size_t n) {  register size_t i = 0;  while( i < n ) {    if( tolower(*s1) != tolower(*s2) ) {      return tolower(*s1) - tolower(*s2);    }    s1++;    s2++;  }  return 0;}#endifvoid init_file_handling() {  /* preallocate primary text holding buffer */  textbuf_len = BUFLEN;  textbuf = malloc(textbuf_len);}void cleanup_file_handling() {  /* free some global resources */  free(textbuf);}void reset_mbox_line_filter() {  mbox.state = UNDEF;  mbox.prev_line_empty = 1;}void reset_xml_character_filter() {  xml.state = TEXT;}/*********************************************************** * MULTIBYTE FILE HANDLING FUNCTIONS                       * * this is suitable for any locale whose character set     * * encoding doesn't include NUL bytes inside characters    * ***********************************************************//* returns true if the line should be processed further   depends on global mbox state *//* note: mail filtering is very complicated, since   any message can be mangled and requoted by a sender   this will only cope with clean mail boxes */bool_t mbox_line_filter(char *line) {  bool_t process_line = 0; /* by default we skip the line */  /* below we decide if we want to process the line */  switch(mbox.state) {  case UNDEF:    if( mbox.prev_line_empty && 	(!strncmp(line, "From ", 5) || 	 ((line[0] == '-') && (line[1] == '-') && !isspace(line[2]))) ) {      /* if it starts with From_ or else it looks like --xxxxx, 	 it signals a new header */      mbox.state = HEADER;      mbox.checked_content_type = 0;    }    break;  case HEADER:    if( *line == '\n' ) {      mbox.state = BODY;    } else if( !strncmp(line, "Content-Type:", 13) ) {      mbox.checked_content_type = 1;      if( strstr(line + 13, "text/") || 	  strstr(line + 13, "TEXT/") ) {	mbox.mime_type = 1; /* good */      } else {	mbox.mime_type = 0; /* bad */      }    } else if( !strncmp(line, "Subject:", 8) ||	       !(strncmp(line, "From:", 5)) ) {      /* process subject and from line like body */      process_line = 1;    }    break;  case BODY:      if( mbox.prev_line_empty && 	  (!strncmp(line, "From ", 5) || 	   ((line[0] == '-') && (line[1] == '-') && !isspace(line[2])))) {	/* if it starts with From_ or else it looks like --xxxxx, 	   it signals a new header */	mbox.state = HEADER;	mbox.checked_content_type = 0;      } else if( mbox.checked_content_type && mbox.mime_type ) { 	process_line = 1;      } else if( !mbox.checked_content_type ) {	process_line = 1; /* be lenient */      }      break;    }    mbox.prev_line_empty = (*line == '\n') ? 1 : 0; /* for next time */    return process_line;}/* removes tags in the string - modifies in place *//* the name of this function is a misnomer, since it doesn't   parse xml properly. But we just want a simple kludge    for most html flavours */void xml_character_filter(char *line) {  char *q;  q = line;  while( *line ) {    switch(xml.state) {    case TEXT:      /* does it look like <x where x is either alpha or punctuation? */      if( line[0] == '<' ) {	if( !strncmp(line + 1, "!--", 3) ) {	  xml.state = COMMENT;	  line += 3;	} else if( !strncasecmp(line + 1, "script", 6) ) {	  xml.state = SPECIAL;	  line += 6;	} else if( isalpha(line[1]) || ispunct(line[1]) ) {	  xml.state = TAG;	  line += 1;	}      } else {	*q++ = *line;      }      break;    case TAG:      if( (line[1] == '>') && 	  (isalpha(*line) || ispunct(*line)) ) {	xml.state = TEXT;	line++;       }      break;    case COMMENT:      if( line[0] == '-' && !strncmp(line + 1, "->", 2) ) {	xml.state = TEXT;	line += 2;      }      break;    case SPECIAL:      if( line[0] == '<' ) {	if( !strncasecmp(line + 1, "/script", 7) ) {	  xml.state = TEXT;	  line += 7;	}      }      break;    }    line++;  }  *q = '\0'; /* mark the end of the clean text string */}#define MULTIBYTE_EPSILON 10 /* enough for a multibyte char and a null char *//* reads a text file as input, converting each lineinto a wide character representation and applies severalfilters. */void process_file(FILE *input, 		  int (*line_filter)(char *),		  void (*character_filter)(char *), 		  void (*word_fun)(char *, token_order_t, regex_count_t), 		  char *(*pre_line_fun)(char *),		  void (*post_line_fun)(char *)) {  char *s, *pptextbuf;  regex_count_t i;  charbuf_len_t k;  int eflag;  token_order_t z, j, n, order;  charbuf_len_t l;  char *p, *q, *qq;  char tok[MAX_TOKEN_LEN+2];  regmatch_t pmatch[MAX_SUBMATCH];  char ntok[MAX_TOKEN_LEN+2];  char *nq;  token_order_t nhow_many;#if defined HAVE_LIBBOOST_REGEX  charbuf_len_t wclen;  wchar_t *wp;  mbstate_t tok_shiftstate, input_shiftstate;  charbuf_len_t tok_len;  memset(&input_shiftstate, 0, sizeof(mbstate_t));#endif  /* initialize the norex state */  ntok[0] = DIAMOND;  ntok[1] = '\0';  nq = ntok + 1;  nhow_many = 0;   /* now start processing */  while( !feof(input) ) {    /* read in a full line, allocating memory as necessary */    textbuf[0] = '\0';    s = textbuf;    l = textbuf_len;    k = 1;    while( fgets(s, l, input) && (strlen(s) >= (l - 1)) ) {      textbuf = realloc(textbuf, 2 * textbuf_len);      if( !textbuf ) {	fprintf(stderr, 		"error: not enough memory for input line (%d bytes)\n",		textbuf_len);	exit(0);      }      s = textbuf + textbuf_len - (k++);      l = textbuf_len;      textbuf_len *= 2;    }    /* preprocesses textbuf, optionally censors it */    if( pre_line_fun ) {      pptextbuf = (*pre_line_fun)(textbuf);      if( !pptextbuf ) { continue; }    } else {      pptextbuf = textbuf;    }    /* next we check to see if this line should be skipped */    if( *pptextbuf && (!line_filter || (*line_filter)(pptextbuf)) ) {      /* now filter some of the characters in the current line */      if( character_filter ) { (*character_filter)(pptextbuf); }      /* repeat for each regular expression:	 find all the instances of a matching substring */#if defined HAVE_LIBBOOST_REGEX      /* damn - we compiled wide character regexes, 	 now we've got to use them (not really, but who in their right	 mind would compile with boost if they're not going to convert	 to wide characters anyway) */      if( textbuf_len > wc_textbuf_len ) {	wc_textbuf_len = textbuf_len;	wc_textbuf = realloc(wc_textbuf, wc_textbuf_len * sizeof(wchar_t));	if( !wc_textbuf ) {	  fprintf(stderr, 		  "error: not enough memory for wide character conversion "		  "(%d bytes)\n",		  wc_textbuf_len * sizeof(wchar_t));	  exit(0);	}      }      /* convert as much as we can of the line into wide characters */      s = pptextbuf;      k = textbuf_len;      wp = wc_textbuf;      wclen = 0;      /* since we ensured textbuf_len <= wctextbuf_len	 there will never be overflow of wctextbuf below */      while( k > 0 ) {	l = mbrtowc(wp, s, k, &input_shiftstate);	if( l > 0 ) {	  wp++;	  wclen++;	  k -= l;	  s += l;	} else if( l == 0 ) {	  break;	} else if( l == -1 ) {	  /* try to be robust */	  s++; 	  k--;	  memset(&input_shiftstate, 0, sizeof(mbstate_t));	} else if( l == -2) {	  /* couldn't parse a complete character */	  break;	}      }      *wp = L'\0';      /* now process the regex */      for(i = 0; i < regex_count; i++) {	k = 0;	eflag = 0;	/* see if a match */	while( (k < wclen) && (regexec(&re[i].regex, wc_textbuf + k, 				   MAX_SUBMATCH, pmatch, eflag) == 0) ) { 	  /* all the submatches (delimited by brackets in the regex)  	     get converted, concatenated and the result gets word_fun'd */	  q = tok;	  *q++ = DIAMOND;	  memset(&tok_shiftstate, 0, sizeof(mbstate_t)); 	  for(order = 0, z = 1; 	      (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) {	    if( !(re[i].submatches & (1<<z)) ) 	      { continue; } else { order++; } 	    /* transcribe the submatch into tok */ 	    for(j = pmatch[z].rm_so; j < pmatch[z].rm_eo; j++) { 	      if( q < tok + MAX_TOKEN_LEN - MULTIBYTE_EPSILON ) {		if( options & (1<<OPTION_CASEN) ) {		  tok_len = wcrtomb(q, wc_textbuf[k + j], &tok_shiftstate); 		} else {		  tok_len = wcrtomb(q, towlower(wc_textbuf[k + j]), 				    &tok_shiftstate); 		}		if( (tok_len > -1) ) {		  q += tok_len;		}	      }	    }	    *q++ = DIAMOND;	  } 	  *q = '\0';  	  /* now let each category process the token */	  (*word_fun)(tok, order, i + 1); 	  k += pmatch[0].rm_so + 1; /* advance string and repeat */ 	  eflag = REG_NOTBOL;	}      }#else       /* the GNU regex routines expect an ordinary string */      for(i = 0; i < regex_count; i++) {	k = 0; 	eflag = 0; 	/* see if a match */ 	while( (k < l) && (regexec(&re[i].regex, pptextbuf + k, 				   MAX_SUBMATCH, pmatch, eflag) == 0) ) { 	  /* all the submatches (delimited by brackets in the regex)  	     get concatenated and the result gets word_fun'd */ 	  q = tok;	  *q++ = DIAMOND; 	  for(order = 0, z = 1; 	      (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) {	    if( !(re[i].submatches & (1<<z)) ) 	      { continue; } else { order++; } 	    /* transcribe the submatch into tok */ 	    for(j = pmatch[z].rm_so; j < pmatch[z].rm_eo; j++) { 	      if( q < tok + MAX_TOKEN_LEN ) {		if( options & (1<<OPTION_CASEN) ) {		  *q++ = pptextbuf[k + j];		} else {		  *q++ = tolower(pptextbuf[k + j]);		} 	      } 	    }	    *q++ = DIAMOND; 	  } 	  *q = '\0';  	  /* now let each category process the token */	  (*word_fun)(tok, order, i + 1); 	  k += pmatch[0].rm_so + 1; /* advance string and repeat */ 	  eflag = REG_NOTBOL; 	}	            }      #endif      /* default processing: reads tokens and passes them to	 the word_fun */      if( options & (1<<OPTION_NOREGEX) ) {	p = pptextbuf;	while( *p ) {	  if( isalpha(*p) ) {	    if( nq < ntok + MAX_TOKEN_LEN ) {	      if( !(options & (1<<OPTION_CASEN)) ) {		*nq++ = tolower(*p);	      } else {		*nq++ = *p;	      }	    }	  } else if( *(nq - 1) != DIAMOND ) { /* token boundary */	    *nq++ = DIAMOND;	    *nq = '\0';	    if( ngram_order == 1 ) {
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -