📄 lex-email.c

📁 贝叶斯学习算法分类文本。基于朴素贝叶斯分类器的文本分类的通用算法
💻 C
字号:
/* A lexer with special features for handling e-mail and newsgroup messages.   Specifically, this lexer will remove headers which are in the   BOW_EMAIL_HEADERS_TO_REMOVE array.    Copyright (C) 1997 Andrew McCallum and Jason Rennie   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>                and Jason Rennie <jr6b@andrew.cmu.edu>   This file is part of the Bag-Of-Words Library, `libbow'.   This library is free software; you can redistribute it and/or   modify it under the terms of the GNU Library General Public License   as published by the Free Software Foundation, version 2.      This library is distributed in the hope that it will be useful,   but WITHOUT ANY WARRANTY; without even the implied warranty of   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   Library General Public License for more details.   You should have received a copy of the GNU Library General Public   License along with this library; if not, write to the Free Software   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */#include <bow/libbow.h>#include <ctype.h>		/* for tolower() */char **bow_email_headers_to_remove = NULL;/* String compare function used to compare headers.  Ignores case distinctions   1 => strings don't match.  0 => strings do match. */intstrcmp_ignore_case (char *str1, char *str2){  int pos = 0;  assert(str1 != NULL);  assert(str2 != NULL);  while (tolower(str1[pos]) == tolower(str2[pos]))    {      if (str1[pos] == '\0') { return 0; }      pos++;    }  return 1;}/* Assumes that lex->document[lex->document_position] is at the   beginning of a newline (immediately after a '\n').  Compares the   characters up to (but not including) the first colon against each   string in the BOW_EMAIL_HEADERS_TO_REMOVE array.  If any match is   found, the document_position value is moved to the beginning of the   next line.  Otherwise, returns document_position to where it was */voidbow_lexer_email_check_header (bow_lex *lex){  int old_document_position = lex->document_position;  char *header = bow_malloc(sizeof(char) * BOW_MAX_WORD_LENGTH);  int header_position = 0;  char byte;  int i;  /* Put header name into HEADER character array.  Stop at a non-printable     character, a colon, or an overflow of characters */  do     {      byte = lex->document[lex->document_position++];      header[header_position++] = byte;    }  while (((byte >= 33 && byte <= 57) || (byte >= 59 && byte <= 126))	 && header_position < BOW_MAX_WORD_LENGTH);  /* If it is a proper header, check to see if it is in the list, else     return the document position back to its original value.  See rfc822     for information on lexing e-mail headers. */  if (byte == 58)    {      /* Step through NULL-terminated array. */      for (i = 0; bow_email_headers_to_remove[i]; i++)	if (strcmp_ignore_case (bow_email_headers_to_remove[i], header) == 0)	  {	    while (lex->document[lex->document_position] != '\n'		   || lex->document[lex->document_position+1] == 32		   || lex->document[lex->document_position+1] == 9)	      { lex->document_position++; }	    break;	  }    }  else    {      lex->document_position = old_document_position;    }  free(header);  return;}/* Get the raw token from the document buffer by scanning forward   until we get a start character, and filling the buffer until we get   an ending character.  The resulting token in the buffer is   NULL-terminated.  Return the length of the token. */intbow_lexer_email_get_raw_word (bow_lexer_simple *self, bow_lex *lex, 			       char *buf, int buflen){  int byte;			/* characters read from the FP */  int wordlen;			/* number of characters in the word so far */  /* Ignore characters until we get a beginning character. */  do    {      byte = lex->document[lex->document_position++];      if (byte == 0)	return 0;      if (byte == '\n')	bow_lexer_email_check_header (lex);    }  while (! self->true_to_start (byte));  /* Add the first alphabetic character to the word. */  buf[0] = (self->case_sensitive) ? byte : tolower (byte);  /* Add all the following satisfying characters to the word. */  for (wordlen = 1; wordlen < buflen; wordlen++)    {      byte = lex->document[lex->document_position++];      if (byte == 0)	return 0;      if (byte == '\n')	bow_lexer_email_check_header (lex);      if (! self->false_to_end (byte))	break;      buf[wordlen] = tolower (byte);    }  if (wordlen >= buflen)    bow_error ("Encountered word longer than buffer length=%d", buflen);  /* Back up to point at the character that caused the end of the word. */  lex->document_position--;  /* Terminate it. */  buf[wordlen] = '\0';  return wordlen;}/* Scan a single token from the LEX buffer, placing it in BUF, and   returning the length of the token.  BUFLEN is the maximum number of   characters that will fit in BUF.  If the token won't fit in BUF,   an error is raised. */intbow_lexer_email_get_word (bow_lexer *self, bow_lex *lex, 			 char *buf, int buflen){#define SELF ((bow_lexer_indirect*)self)  int wordlen;			/* number of characters in the word so far */  do     {      /* Yipes, this UNDERLYING_LEXER had better be a BOW_LEXER_SIMPLE! */      wordlen = bow_lexer_email_get_raw_word 	((bow_lexer_simple*)SELF->underlying_lexer, lex, buf, buflen);      if (wordlen == 0)	return 0;    }  while ((wordlen = bow_lexer_simple_postprocess_word	  ((bow_lexer_simple*)SELF->underlying_lexer, lex, buf, buflen))	 == 0);  return wordlen;#undef SELF}/* This is declared in lex-simple.c */extern bow_lexer_simple _bow_alpha_lexer;/* A lexer that ignores headers specified in BOW_EMAIL_HEADERS_TO_REMOVE */const bow_lexer_indirect _bow_email_lexer ={  {    sizeof (typeof (_bow_email_lexer)),    bow_lexer_simple_open_text_fp,    bow_lexer_email_get_word,    bow_lexer_simple_close,    "",			/* document start pattern begins right away */    NULL			/* document end pattern goes to end */  },  (bow_lexer*)&_bow_alpha_lexer, /* default UNDERLYING_LEXER */};const bow_lexer_indirect *bow_email_lexer = &_bow_email_lexer;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -