lex-suffixing.c

来自「卡内基梅隆大学 McCallum 开发的文本分类系统」· C 语言代码 · 共 237 行

237 行

#include <bow/libbow.h>
#include <assert.h>
#include <ctype.h>
#include <string.h>

/* bow_default_lexer_suffixing should be an indirect lexer, with a
   simple lexer as its underlying lexer.  However, I got lazy, and it
   should be considered that the bow_default_lexer_simple is always
   the underlying lexer */

/* When non-zero, reaching the end of the header section while suffixes
   are being appended rewinds the document position to 0 and walks the
   headers a second time with appending turned off. */
#define HEADER_TWICE 1

/* Non-zero while the lexer is still inside the header section.  Set by
   the open functions; cleared when the end of the headers is reached
   on the non-appending pass. */
static int suffixing_doing_headers;

/* Non-zero while header words get SUFFIXING_SUFFIX appended.  Set by
   the open functions; cleared when the headers are rewound. */
static int suffixing_appending_headers;

/* Current suffix: "xxx" followed by the lowercased letters of the
   header name, NUL-terminated.  Emptied at the end of the headers. */
static char suffixing_suffix[BOW_MAX_WORD_LENGTH];

/* Number of characters written into SUFFIXING_SUFFIX by the last call
   to suffixing_snarf_suffix().  It is not updated when the suffix is
   emptied elsewhere, so use strlen() for the live length. */
static int suffixing_suffix_length;

/* Declared here but not referenced by the code in this file. */
int bow_lexer_html_get_raw_word (bow_lexer *self, bow_lex *lex,
                                 char *buf, int buflen);

/* Overwrite the first '\n' at or after the current document position
   with '\0', so that the rest of the line ends the current string.
   A '\0' that is already there is left alone; this happens when the
   headers are walked a second time (HEADER_TWICE=1).  The document
   position itself is not moved. */
static void
suffixing_terminate_line (bow_lex *lex)
{
  int end = lex->document_position;

  /* Nothing to terminate if we already sit at or past the end. */
  if (end >= lex->document_length)
    return;

  while (lex->document[end] != '\n' && lex->document[end] != '\0')
    {
      end++;
      assert (end < lex->document_length);
      /* With NDEBUG the assert vanishes; never write outside the
         document in that case. */
      if (end >= lex->document_length)
        return;
    }
  lex->document[end] = '\0';
}

/* Put the string before the ':' into SUFFIXING_SUFFIX, and replace the
   following newline with a '\0'.

   The suffix is "xxx" followed by the lowercased alphabetic characters
   found between the current document position and the next ':'.  On
   return the document position is left on the ':' (or at the end of
   the document if no ':' exists).  If the document does not start
   with a letter, nothing at all is changed. */
static void
suffixing_snarf_suffix (bow_lex *lex)
{
  /* Hack to get arrow to work on 23k+ research paper index */
  if (!isalpha ((unsigned char) lex->document[0]))
    return;

  suffixing_suffix[0] = 'x';
  suffixing_suffix[1] = 'x';
  suffixing_suffix[2] = 'x';
  suffixing_suffix_length = 3;

  /* Put characters into the suffix until we get to the colon.  The
     length bound keeps the scan inside the document when there is no
     colon at all. */
  while (lex->document_position < lex->document_length
         && lex->document[lex->document_position] != ':')
    {
      /* <ctype.h> functions require a value representable as
         unsigned char; a plain negative char is undefined behaviour. */
      unsigned char c = (unsigned char) lex->document[lex->document_position];

      assert (c != '\n');
      lex->document_position++;
      if (!isalpha (c))
        continue;

      /* Room must remain for this character and the final '\0'.  The
         check happens before the write so that the buffer is never
         overrun, even when asserts are compiled out. */
      assert (suffixing_suffix_length < BOW_MAX_WORD_LENGTH - 1);
      if (suffixing_suffix_length < BOW_MAX_WORD_LENGTH - 1)
        suffixing_suffix[suffixing_suffix_length++] = (char) tolower (c);
    }
  suffixing_suffix[suffixing_suffix_length] = '\0';

  /* Throw away everything else until end of string */
  suffixing_terminate_line (lex);
}

/* Enter header mode on a freshly opened LEX: turn on header handling
   and suffix appending, then read the first header name. */
static void
suffixing_start_headers (bow_lex *lex)
{
  suffixing_doing_headers = 1;
  suffixing_appending_headers = 1;
  suffixing_snarf_suffix (lex);
}

/* Create and return a BOW_LEX by opening FP with the simple lexer,
   then switch into header mode.

   Returns 0 if the simple lexer fails, or if the document does not
   begin with a run of alphanumeric characters followed by a ':'
   (i.e. the first line does not look like a header).  In the latter
   case the lex is closed before returning so that it is not leaked. */
bow_lex *
bow_lexer_suffixing_open_text_fp (bow_lexer *self,
                                  FILE *fp,
                                  const char *filename)
{
  bow_lex *ret;
  int i;

  ret = bow_lexer_simple_open_text_fp (self, fp, filename);
  if (!ret)
    return ret;

  /* Make sure that the first line has a header-type suffix.  A '\n'
     is not alphanumeric, so a newline before the colon is rejected by
     the same test. */
  for (i = 0; i < ret->document_length && ret->document[i] != ':'; i++)
    {
      if (!isalnum ((unsigned char) ret->document[i]))
        {
          bow_lexer_simple_close (self, ret);
          return 0;
        }
    }

  /* No colon anywhere: there is no header name to read. */
  if (i >= ret->document_length)
    {
      bow_lexer_simple_close (self, ret);
      return 0;
    }

  suffixing_start_headers (ret);
  return ret;
}

/* Create and return a BOW_LEX by opening the string BUF with the
   simple lexer, then switch into header mode.  Returns whatever the
   simple lexer returned; unlike the FP variant, the first line is not
   validated here. */
bow_lex *
bow_lexer_suffixing_open_str (bow_lexer *self, char *buf)
{
  bow_lex *ret = bow_lexer_simple_open_str (self, buf);

  if (ret)
    suffixing_start_headers (ret);
  return ret;
}

/* Postprocess the word in BUF and, while in the headers, either append
   the current suffix (first pass) or drop words belonging to a
   `Reference.*:' header (second pass).  Afterwards, if the document
   position sits on a '\0' inside the headers, set up for the next
   header line or leave header mode.

   Returns the resulting length of BUF, or 0 if the word was dropped. */
int
bow_lexer_suffixing_postprocess_word (bow_lexer *self, bow_lex *lex,
                                      char *buf, int buflen)
{
  int len;

  /* Postprocess the word */
  len = bow_lexer_next_postprocess_word (self, lex, buf, buflen);

  if (len != 0 && suffixing_doing_headers)
    {
      if (suffixing_appending_headers)
        {
          size_t word_len = strlen (buf);
          size_t suffix_len = strlen (suffixing_suffix);
          size_t room = 0;

          if (buflen > 0 && (size_t) buflen > word_len)
            room = (size_t) buflen - word_len - 1;

          /* Check that the suffixed word fits BEFORE writing it; the
             copy below is additionally clamped so that a build without
             asserts truncates instead of overrunning BUF. */
          assert (word_len + suffix_len < (size_t) buflen);
          if (suffix_len > room)
            suffix_len = room;
          memcpy (buf + word_len, suffixing_suffix, suffix_len);
          buf[word_len + suffix_len] = '\0';
          len = (int) (word_len + suffix_len);
        }
      else
        {
          /* Skip the `Reference.*:' words the second time through */
          if (strstr (suffixing_suffix, "xxxreference"))
            len = 0;
        }
    }

  /* Set up for the next word */
  if (suffixing_doing_headers && lex->document[lex->document_position] == '\0')
    {
      /* This was two newlines in a row or the end of the file. */
      if (lex->document_position == (lex->document_length - 1)
          || lex->document[lex->document_position + 1] == '\n')
        {
#if HEADER_TWICE
          if (!suffixing_appending_headers)
            suffixing_doing_headers = 0;
          else
            {
              /* Rewind and walk the headers again without appending. */
              lex->document_position = 0;
              suffixing_appending_headers = 0;
              suffixing_snarf_suffix (lex);
            }
#else
          lex->document_position++;
#endif
          /* NOTE(review): this also clears the suffix that was just
             re-read for the second pass, so the first header of that
             pass has an empty suffix -- confirm this is intended. */
          suffixing_suffix[0] = '\0';
        }
      else
        {
          lex->document_position++;
          /* Handle email messages with multi-line headers */
          if (isalnum ((unsigned char) lex->document[lex->document_position]))
            suffixing_snarf_suffix (lex);
          else
            /* No need to grab a suffix, but must replace the \n with \0 */
            suffixing_terminate_line (lex);
        }
    }

  return len;
}

/* Scan a single token from the LEX buffer, placing it in BUF, and
   returning the length of the token.  BUFLEN is the maximum number of
   characters that will fit in BUF.  Returns 0 when no more tokens are
   available.  Words that postprocessing reduces to length 0 are
   skipped. */
int
bow_lexer_suffixing_get_word (bow_lexer *self, bow_lex *lex,
                              char *buf, int buflen)
{
  int wordlen;                  /* number of characters in the word so far */

  do
    {
      wordlen = bow_lexer_next_get_raw_word (self, lex, buf, buflen);
      if (wordlen == 0)
        {
          if (suffixing_doing_headers
              && lex->document_position < lex->document_length)
            /* We are just at the end of the headers, not at the end
               of the file.  bow_lexer_suffixing_postprocess_word()
               will deal with this */
            buf[0] = '\0';
          else
            return 0;
        }
    }
  /* NOTE(review): suffixing_snarf_suffix() lowercases every character
     it stores, so the upper-case "URL" below can never match; the
     literal is kept unchanged to preserve existing token output --
     confirm whether "url" was intended. */
  while (((wordlen = bow_lexer_suffixing_postprocess_word
           (self, lex, buf, buflen)) == 0)
         || strstr (suffixing_suffix, "URL"));

  return (int) strlen (buf);
}

/* A lexer that, while in the header section, appends to each token a
   suffix built from the header name at the beginning of the line
   ("xxx" followed by the lowercased letters before the ':'). */
const bow_lexer _bow_suffixing_lexer =
{
  sizeof (bow_lex),
  NULL,
  bow_lexer_suffixing_open_text_fp,
  bow_lexer_suffixing_open_str,
  bow_lexer_suffixing_get_word,
  NULL,
  NULL,
  bow_lexer_simple_close,
};
const bow_lexer *bow_suffixing_lexer = &_bow_suffixing_lexer;

lex-suffixing.c - 源码说明

本页面展示了「卡内基梅隆大学 McCallum 开发的文本分类系统」中的 lex-suffixing.c 源码文件，采用 C 语言编写，共 237 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫开发者社区收录了大量与文本分类相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?