📄 fh.c

📁 dbacl是一个通用目的的digramic贝叶斯文本分类器。它可以学习你提供的文本
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
	      /* let each category process the token */	      (*word_fun)(ntok, (token_order_t)1, 0);	      nq = ntok;	      *nq++ = DIAMOND;	    } else {	      if( ++nhow_many > ngram_order ) {		nhow_many--;		/* move all tokens down by one */		for(nq = ntok + 1; *nq != DIAMOND; nq++) {};		for(nq++, qq = ntok + 1; *nq; *qq++ = *nq++) {};		*qq = '\0';		nq = qq;	      }	      qq = ntok;	      for(n = nhow_many; n > 0; n--) {		/* let each category process the token */		(*word_fun)(qq, (token_order_t)n, 0);		qq++;		/* skip to next token and repeat */		while(*qq != DIAMOND ) { qq++; }	      }	    }	  }	  p++;	}      }      /* now summarize this line if required */      if( post_line_fun ) { (*post_line_fun)(pptextbuf); }          } else {       /* since we don't process this line, we	 reset XML tag state for next time */      xml.state = TEXT;     }  }}/*********************************************************** * WIDE CHARACTER FILE HANDLING FUNCTIONS                  * * this is needed for any locale whose character set       * * encoding can include NUL bytes inside characters        * ***********************************************************/#if defined HAVE_WCHAR_H && defined HAVE_WCTYPE_H/* returns true if the line should be processed further   depends on global mbox state */bool_t w_mbox_line_filter(wchar_t *line) {  bool_t process_line = 0; /* by default we skip the line */  /* below we decide if we want to process the line */  switch(mbox.state) {  case UNDEF:    if( mbox.prev_line_empty && 	(!wcsncmp(line, L"From ", 5) || 	 ((line[0] == L'-') && (line[1] == L'-') && !iswspace(line[2]))) ) {      /* if it starts with From_ or else it looks like --xxxxx, 	 it signals a new header */      mbox.state = HEADER;      mbox.checked_content_type = 0;    }    break;  case HEADER:    if( *line == L'\n' ) {      mbox.state = BODY;    } else if( !wcsncmp(line, L"Content-Type:", 13) ) {      mbox.checked_content_type = 1;      if( wcsstr(line + 13, L"text/") || 	  wcsstr(line + 13, L"TEXT/") ) {	mbox.mime_type = 1; /* good */      } else {	mbox.mime_type = 0; /* bad */      }    } else if( !wcsncmp(line, L"Subject:", 8) ||	       !(wcsncmp(line, L"From:", 5)) ) {      /* process subject and from line like body */      process_line = 1;    }    break;  case BODY:    if( mbox.prev_line_empty && 	(!wcsncmp(line, L"From ", 5) || 	 ((line[0] == L'-') && (line[1] == L'-') && !iswspace(line[2])))) {      /* if it starts with From_ or else it looks like --xxxxx, 	 it signals a new header */      mbox.state = HEADER;      mbox.checked_content_type = 0;    } else if( mbox.checked_content_type && mbox.mime_type ) {       process_line = 1;    } else if( !mbox.checked_content_type ) {      /* unless we explicitly are told it's not text, we process it */      /* this will let quoted uuencoded content through :-( */      process_line = 1;    }    break;  }  mbox.prev_line_empty = (*line == L'\n') ? 1 : 0; /* for next time */  return process_line;}/* removes tags in the string - modifies in place *//* the name of this function is a misnomer, since it doesn't   parse xml properly. But we just want a simple kludge    for most html flavours */void w_xml_character_filter(wchar_t *line) {  wchar_t *q;  q = line;  while( *line ) {    switch(xml.state) {    case TEXT:      /* does it look like <x where x is either alpha or punctuation? */      if( line[0] == L'<' ) {	if( !wcsncmp(line + 1, L"!--", 3) ) {	  xml.state = COMMENT;	  line += 3;	} else if( wcsncasecmp(line + 1, L"script", 6) != 0 ) {	  xml.state = SPECIAL;	  line += 6;	} else if( iswalpha(line[1]) || iswpunct(line[1]) ) {	  xml.state = TAG;	  line += 1;	}      } else {	*q++ = *line;      }      break;    case TAG:      if( (line[1] == L'>') && 	  (iswalpha(*line) || iswpunct(*line)) ) {	xml.state = TEXT;	line++;       }      break;    case COMMENT:      if( line[0] == L'-' && !wcsncmp(line + 1, L"->", 2) ) {	xml.state = TEXT;	line += 2;      }      break;    case SPECIAL:      if( line[0] == L'<' ) {	if( !wcsncasecmp(line + 1, L"/script", 7) ) {	  xml.state = TEXT;	  line += 7;	}      }      break;    }    line++;  }  *q = L'\0'; /* mark the end of the clean text string */}/* reads a text file as input, converting each lineinto a wide character representation and applies severalfilters. */void w_process_file(FILE *input, 		    int (*line_filter)(wchar_t *),		    void (*character_filter)(wchar_t *), 		    void (*word_fun)(char *, token_order_t, regex_count_t), 		    char *(*pre_line_fun)(char *),		    void (*post_line_fun)(char *)) {  char *s, *pptextbuf;  regex_count_t i;  charbuf_len_t k;  int eflag;  token_order_t z, j, n, order;  charbuf_len_t l;  charbuf_len_t wclen;  wchar_t *wp;  mbstate_t tok_shiftstate, input_shiftstate;  charbuf_len_t tok_len;  char *q, *qq;  char tok[MAX_TOKEN_LEN+2];  regmatch_t pmatch[MAX_SUBMATCH];  char *nq;  char ntok[MAX_TOKEN_LEN+2];  token_order_t nhow_many;  /* initialize the norex state */  ntok[0] = DIAMOND;  ntok[1] = '\0';  nq = ntok + 1;  nhow_many = 0;  memset(&input_shiftstate, 0, sizeof(mbstate_t));  while( !feof(input) ) {    /* read in a full line, allocating memory as necessary */    textbuf[0] = '\0';    s = textbuf;    l = textbuf_len;    k = 1;    while( fgets(s, l, input) && (strlen(s) >= (l - 1)) ) {      textbuf = realloc(textbuf, 2 * textbuf_len);      if( !textbuf ) {	fprintf(stderr, 		"error: not enough memory for input line (%d bytes)\n",		textbuf_len);	exit(0);      }      s = textbuf + textbuf_len - (k++);      l = textbuf_len;      textbuf_len *= 2;    }    /* preprocesses textbuf, optionally censors it */    if( pre_line_fun ) {      pptextbuf = (*pre_line_fun)(textbuf);      if( !pptextbuf ) { continue; }    } else {      pptextbuf = textbuf;    }    /* now convert the line into a wide character string */    if( textbuf_len > wc_textbuf_len ) {      wc_textbuf_len = textbuf_len;      wc_textbuf = realloc(wc_textbuf, wc_textbuf_len * sizeof(wchar_t));      if( !wc_textbuf ) {	fprintf(stderr, 		"error: not enough memory for wide character conversion "		"(%ld bytes)\n",		(long int)(wc_textbuf_len * sizeof(wchar_t)));	exit(0);      }    }    /* convert as much as we can of the line into wide characters */    s = pptextbuf;    k = textbuf_len;    wp = wc_textbuf;    wclen = 0;    /* since we ensured textbuf_len <= wctextbuf_len       there will never be overflow of wctextbuf below */    while( k > 0 ) {      l = mbrtowc(wp, s, k, &input_shiftstate);      if( l > 0 ) {	wp++;	wclen++;	k -= l;	s += l;      } else if( l == 0 ) {	break;      } else if( l == -1 ) {	/* try to be robust */	s++; 	k--;	memset(&input_shiftstate, 0, sizeof(mbstate_t));      } else if( l == -2) {	/* couldn't parse a complete character */	break;      }    }    *wp = L'\0';    /* next we check to see if this line should be skipped */    if( (wclen > 0) && 	(!line_filter || (*line_filter)(wc_textbuf)) ) {      /* now filter some of the characters in the current line */      if( character_filter ) { (*character_filter)(wc_textbuf); }      /* repeat for each regular expression:	 find all the instances of a matching substring */#if defined HAVE_LIBBOOST_REGEX      for(i = 0; i < regex_count; i++) {	k = 0;	eflag = 0;	/* see if a match */	while( (k < wclen) && (regexec(&re[i].regex, wc_textbuf + k, 				   MAX_SUBMATCH, pmatch, eflag) == 0) ) { 	  /* all the submatches (delimited by brackets in the regex)  	     get converted, concatenated and the result gets word_fun'd */	  q = tok;	  *q++ = DIAMOND;	  memset(&tok_shiftstate, 0, sizeof(mbstate_t)); 	  for(order = 0, z = 1; 	      (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) {	    if( !(re[i].submatches & (1<<z)) ) 	      { continue; } else { order++; } 	    /* transcribe the submatch into tok */ 	    for(j = pmatch[z].rm_so; j < pmatch[z].rm_eo; j++) { 	      if( q < tok + MAX_TOKEN_LEN - MULTIBYTE_EPSILON ) {		if( options & (1<<OPTION_CASEN) ) {		  tok_len = wcrtomb(q, wc_textbuf[k + j], &tok_shiftstate); 		} else {		  tok_len = wcrtomb(q, towlower(wc_textbuf[k + j]), 				    &tok_shiftstate); 		}		if( (tok_len > -1) ) {		  q += tok_len;		}	      }	    }	    *q++ = DIAMOND;	  } 	  *q = '\0';  	  /* now let each category process the token */ 	  (*word_fun)(tok, order, i + 1); 	  k += pmatch[0].rm_so + 1; /* advance string and repeat */ 	  eflag = REG_NOTBOL;	}      }#else       /* the GNU regex routines expect a multibyte string */      if( textbuf_len > aux_textbuf_len ) {	aux_textbuf_len = textbuf_len;	aux_textbuf = realloc(aux_textbuf, aux_textbuf_len);	if( !aux_textbuf ) {	  fprintf(stderr, 		  "error: not enough memory for auxiliary text buffer "		  "(%d bytes)\n",		  aux_textbuf_len);	  exit(0);	}      }      l = wcstombs(aux_textbuf, wc_textbuf, aux_textbuf_len - 1);      aux_textbuf[aux_textbuf_len - 1] = '\0';      for(i = 0; i < regex_count; i++) {	k = 0; 	eflag = 0; 	/* see if a match */ 	while( (k < l) && (regexec(&re[i].regex, aux_textbuf + k, 				   MAX_SUBMATCH, pmatch, eflag) == 0) ) { 	  /* all the submatches (delimited by brackets in the regex)  	     get concatenated and the result gets word_fun'd */ 	  q = tok;	  *q++ = DIAMOND; 	  for(order = 0, z = 1; 	      (z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) {	    if( !(re[i].submatches & (1<<z)) ) 	      { continue; } else { order++; } 	    /* transcribe the submatch into tok */ 	    for(j = pmatch[z].rm_so; j < pmatch[z].rm_eo; j++) { 	      if( q < tok + MAX_TOKEN_LEN ) {		if( options & (1<<OPTION_CASEN) ) {		  *q++ = aux_textbuf[k + j];		} else {		  *q++ = tolower(aux_textbuf[k + j]);		} 	      } 	    }	    *q++ = DIAMOND; /* hope DIAMOND can never be matched inside regex */ 	  } 	  *q = '\0';  	  /* now let each category process the token */	  (*word_fun)(tok, order, i + 1); 	  k += pmatch[0].rm_so + 1; /* advance string and repeat */ 	  eflag = REG_NOTBOL; 	}	            }      #endif      /* default processing: reads tokens, converting them	 to multibyte representation before passing them to	 the word_fun */      if( options & (1<<OPTION_NOREGEX) ) {	wp = wc_textbuf;	memset(&tok_shiftstate, 0, sizeof(mbstate_t));	while( *wp ) {	  if( iswalpha(*wp) ) {	    if( !(options & (1<<OPTION_CASEN)) ) {	      *wp = towlower(*wp);	    }	    if( nq < ntok + MAX_TOKEN_LEN - MULTIBYTE_EPSILON ) {	      tok_len = wcrtomb(nq, *wp, &tok_shiftstate); 	      if( (tok_len > -1) ) {		nq += tok_len;	      }	    }	  } else if( *(nq - 1) != DIAMOND ) { /* token boundary */	    *nq++ = DIAMOND;	    *nq = '\0';	    if( ngram_order == 1 ) {	      /* let each category process the token */	      (*word_fun)(ntok, (token_order_t)1, 0);	      nq = ntok;	      *nq++ = DIAMOND;	    } else {	      if( ++nhow_many > ngram_order ) {		nhow_many--;		/* move all tokens down by one */		for(nq = ntok + 1; *nq != DIAMOND; nq++) {};		for(nq++, qq = ntok + 1; *nq; *qq++ = *nq++) {};		*qq = '\0';		nq = qq;	      }	      qq = ntok;	      for(n = nhow_many; n > 0; n--) {		/* let each category process the token */		(*word_fun)(qq, (token_order_t)n, 0);		qq++;		/* skip to next token and repeat */		while(*qq != DIAMOND ) { qq++; }	      }	    }	  }	  wp++;	}      }      /* now summarize this line if required */      if( post_line_fun ) { (*post_line_fun)(pptextbuf); }          } else {       /* since we don't process this line, we	 reset XML tag state for next time */      xml.state = TEXT;     }  }}#endif /* DISABLE_WCHAR */
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -