⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stop.c

📁 信息检索中常用的技术
💻 C
📖 第 1 页 / 共 2 页
字号:
/*********************************   stop.c   ********************************    Purpose:     Stop list filter finite state machine generator and driver.    Provenence:  Written by and unit tested by C. Fox, 1990.                 Changed by C. Fox, July 1991.                    - added ANSI C declarations                    - branch tested to 95% coverage    Notes:       This module implements a fast finite state machine                  generator, and a driver, for implementing stop list                  filters.  The strlist module is a simple string array                 data type implementation.**/#include <stdio.h>#include <ctype.h>#include <string.h>#include <malloc.h>#include "stop.h"#include "strlist.h"/******************************************************************************/#define FALSE                        0#define TRUE                         1#define EOS                        '\0'         /**************   Character Classification   ***************/         /* Tokenizing requires that ASCII be broken into character */         /* classes distinguished for tokenizing.  Delimiter chars  */         /* separate tokens.  Digits and letters make up the body   */         /* of search terms.                                        */typedef enum {                DELIM_CH,              /* whitespace, punctuation, etc. */                DIGIT_CH,              /* the digits */                LETTER_CH,             /* upper and lower case */            } CharClassType;static CharClassType char_class[128] = {      /* ^@ */  DELIM_CH,    /* ^A */  DELIM_CH,    /* ^B */  DELIM_CH,      /* ^C */  DELIM_CH,    /* ^D */  DELIM_CH,    /* ^E */  DELIM_CH,      /* ^F */  DELIM_CH,    /* ^G */  DELIM_CH,    /* ^H */  DELIM_CH,      /* ^I */  DELIM_CH,    /* ^J */  DELIM_CH,    /* ^K */  DELIM_CH,      /* ^L */  DELIM_CH,    /* ^M */  DELIM_CH,    /* ^N */  DELIM_CH,      /* ^O */  DELIM_CH,    /* ^P */  DELIM_CH,    /* ^Q */  DELIM_CH,      /* ^R */  DELIM_CH,    /* ^S */  DELIM_CH,    /* ^T */  DELIM_CH,      /* ^U */  DELIM_CH,    /* ^V */  DELIM_CH,    /* ^W */  DELIM_CH,      /* ^X */  DELIM_CH,    /* ^Y */  DELIM_CH,    /* ^Z */  DELIM_CH,      /* ^[ */  DELIM_CH,    /* ^\ */  DELIM_CH,    /* ^] */  DELIM_CH,      /* ^^ */  DELIM_CH,    /* ^_ */  DELIM_CH,    /*    */  DELIM_CH,      /*  ! */  DELIM_CH,    /*  " */  DELIM_CH,    /*  # */  DELIM_CH,      /*  $ */  DELIM_CH,    /*  % */  DELIM_CH,    /*  & */  DELIM_CH,      /*  ' */  DELIM_CH,    /*  ( */  DELIM_CH,    /*  ) */  DELIM_CH,      /*  * */  DELIM_CH,    /*  + */  DELIM_CH,    /*  , */  DELIM_CH,      /*  - */  DELIM_CH,    /*  . */  DELIM_CH,    /*  / */  DELIM_CH,      /*  0 */  DIGIT_CH,    /*  1 */  DIGIT_CH,    /*  2 */  DIGIT_CH,      /*  3 */  DIGIT_CH,    /*  4 */  DIGIT_CH,    /*  5 */  DIGIT_CH,      /*  6 */  DIGIT_CH,    /*  7 */  DIGIT_CH,    /*  8 */  DIGIT_CH,      /*  9 */  DIGIT_CH,    /*  : */  DELIM_CH,    /*  ; */  DELIM_CH,      /*  < */  DELIM_CH,    /*  = */  DELIM_CH,    /*  > */  DELIM_CH,      /*  ? */  DELIM_CH,    /*  @ */  DELIM_CH,    /*  A */  LETTER_CH,      /*  B */  LETTER_CH,   /*  C */  LETTER_CH,   /*  D */  LETTER_CH,      /*  E */  LETTER_CH,   /*  F */  LETTER_CH,   /*  G */  LETTER_CH,      /*  H */  LETTER_CH,   /*  I */  LETTER_CH,   /*  J */  LETTER_CH,      /*  K */  LETTER_CH,   /*  L */  LETTER_CH,   /*  M */  LETTER_CH,      /*  N */  LETTER_CH,   /*  O */  LETTER_CH,   /*  P */  LETTER_CH,      /*  Q */  LETTER_CH,   /*  R */  LETTER_CH,   /*  S */  LETTER_CH,      /*  T */  LETTER_CH,   /*  U */  LETTER_CH,   /*  V */  LETTER_CH,      /*  W */  LETTER_CH,   /*  X */  LETTER_CH,   /*  Y */  LETTER_CH,      /*  Z */  LETTER_CH,   /*  [ */  DELIM_CH,    /*  \ */  DELIM_CH,      /*  ] */  DELIM_CH,    /*  ^ */  DELIM_CH,    /*  _ */  DELIM_CH,      /*  ` */  DELIM_CH,    /*  a */  LETTER_CH,   /*  b */  LETTER_CH,      /*  c */  LETTER_CH,   /*  d */  LETTER_CH,   /*  e */  LETTER_CH,      /*  f */  LETTER_CH,   /*  g */  LETTER_CH,   /*  h */  LETTER_CH,      /*  i */  LETTER_CH,   /*  j */  LETTER_CH,   /*  k */  LETTER_CH,      /*  l */  LETTER_CH,   /*  m */  LETTER_CH,   /*  n */  LETTER_CH,      /*  o */  LETTER_CH,   /*  p */  LETTER_CH,   /*  q */  LETTER_CH,      /*  r */  LETTER_CH,   /*  s */  LETTER_CH,   /*  t */  LETTER_CH,      /*  u */  LETTER_CH,   /*  v */  LETTER_CH,   /*  w */  LETTER_CH,      /*  x */  LETTER_CH,   /*  y */  LETTER_CH,   /*  z */  LETTER_CH,      /*  { */  DELIM_CH,    /*  | */  DELIM_CH,    /*  } */  DELIM_CH,      /*  ~ */  DELIM_CH,    /* ^? */  DELIM_CH,                          };         /**************   Character Case Conversion   **************/         /* Term text must be accumulated in a single case.  This   */         /* array is used to convert letter case but otherwise      */         /* preserve characters.                                    */static char convert_case[128] = {      /* ^@ */    0,    /* ^A */    0,    /* ^B */    0,    /* ^C */    0,      /* ^D */    0,    /* ^E */    0,    /* ^F */    0,    /* ^G */    0,      /* ^H */    0,    /* ^I */    0,    /* ^J */    0,    /* ^K */    0,      /* ^L */    0,    /* ^M */    0,    /* ^N */    0,    /* ^O */    0,      /* ^P */    0,    /* ^Q */    0,    /* ^R */    0,    /* ^S */    0,      /* ^T */    0,    /* ^U */    0,    /* ^V */    0,    /* ^W */    0,      /* ^X */    0,    /* ^Y */    0,    /* ^Z */    0,    /* ^[ */    0,      /* ^\ */    0,    /* ^] */    0,    /* ^^ */    0,    /* ^_ */    0,      /*    */  ' ',    /*  ! */  '!',    /*  " */  '"',    /*  # */  '#',      /*  $ */  '$',    /*  % */  '%',    /*  & */  '&',    /*  ' */ '\'',      /*  ( */  '(',    /*  ) */  ')',    /*  * */  '*',    /*  + */  '+',      /*  , */  ',',    /*  - */  '-',    /*  . */  '.',    /*  / */  '/',      /*  0 */  '0',    /*  1 */  '1',    /*  2 */  '2',    /*  3 */  '3',      /*  4 */  '4',    /*  5 */  '5',    /*  6 */  '6',    /*  7 */  '7',      /*  8 */  '8',    /*  9 */  '9',    /*  : */  ':',    /*  ; */  ';',      /*  < */  '<',    /*  = */  '=',    /*  > */  '>',    /*  ? */  '?',      /*  @ */  '@',    /*  A */  'a',    /*  B */  'b',    /*  C */  'c',      /*  D */  'd',    /*  E */  'e',    /*  F */  'f',    /*  G */  'g',      /*  H */  'h',    /*  I */  'i',    /*  J */  'j',    /*  K */  'k',      /*  L */  'l',    /*  M */  'm',    /*  N */  'n',    /*  O */  'o',      /*  P */  'p',    /*  Q */  'q',    /*  R */  'r',    /*  S */  's',      /*  T */  't',    /*  U */  'u',    /*  V */  'v',    /*  W */  'w',      /*  X */  'x',    /*  Y */  'y',    /*  Z */  'z',    /*  [ */  '[',      /*  \ */   92,    /*  ] */  ']',    /*  ^ */  '^',    /*  _ */  '_',      /*  ` */  '`',    /*  a */  'a',    /*  b */  'b',    /*  c */  'c',      /*  d */  'd',    /*  e */  'e',    /*  f */  'f',    /*  g */  'g',      /*  h */  'h',    /*  i */  'i',    /*  j */  'j',    /*  k */  'k',      /*  l */  'l',    /*  m */  'm',    /*  n */  'n',    /*  o */  'o',      /*  p */  'p',    /*  q */  'q',    /*  r */  'r',    /*  s */  's',      /*  t */  't',    /*  u */  'u',    /*  v */  'v',    /*  w */  'w',      /*  x */  'x',    /*  y */  'y',    /*  z */  'z',    /*  { */  '{',      /*  | */  '|',    /*  } */  '}',    /*  ~ */  '~',    /* ^? */    0, };#define DEAD_STATE                    -1   /* used to block a DFA */#define TABLE_INCREMENT              256   /* used to grow tables */       /*************************   Hashing   **************************/       /* Sets of suffixes labeling states during the DFA construction */       /* are hashed to speed searching.  The hashing function uses an */       /* entire integer variable range as its hash table size;  in an */       /* effort to get a good spread through this range, hash values  */       /* start big, and are incremented by a lot with every new word  */       /* in the list.  The collision rate is low using this method    */#define HASH_START               5775863#define HASH_INCREMENT          38873647       /**************   State Label Binary Search Tree   **************/       /* During DFA construction, all states must be searched by      */       /* their labels to make sure that the minimum number of states  */       /* are used.  This operation is sped up by hashing the labels   */       /* to a signature value, then storing the signatures and labels */       /* in a binary search tree.  The tree is destroyed once the DFA */       /* is fully constructed.                                        */typedef struct TreeNode {       StrList label;            /* state label used as search key     */       unsigned signature;       /* hashed label to speed searching    */       int state;                /* whose label is representd by node  */       struct TreeNode *left;    /* left binary search subtree         */       struct TreeNode *right;   /* right binary search subtree        */       } SearchTreeNode, *SearchTree;       /*********************   DFA State Table   **********************/       /* The state table is an array of structures holding a state    */       /* label, a count of the arcs out of the state, a pointer into  */       /* the arc table for these arcs, and a final state flag.  The   */       /* label field is used only during machine construction.        */typedef struct {       StrList label;            /* for this state - used during build */       int num_arcs;             /* for this state in the arc table    */       int arc_offset;           /* for finding arcs in the arc table  */       short is_final;           /* TRUE iff this is a final state     */       } StateTableEntry, *StateTable;       /**********************   DFA Arc Table   ***********************/       /* The arc table lists all transitions for all states in a DFA  */       /* in compacted form.  Each state's transitions are offset from */       /* the start of the table, then listed in arc label order.      */       /* Transitions are found by a linear search of the sub-section  */       /* of the table for a given state.                              */typedef struct {       char label;               /* character label on an out-arrow    */       int target;               /* the target state for the out-arrow */       } ArcTableEntry, *ArcTable;       /**********************   DFA Structure   ***********************/       /* A DFA is represented as a pointer to a structure holding the */       /* machine's state and transition tables, and bookkeepping      */       /* counters.  The tables are arrays whose space is malloc'd,    */       /* then realloc'd if more space is required.  Once a machine is */       /* constructed, the table space is realloc'd one last time to   */       /* fit the needs of the machine exactly.                        */typedef struct _DfaStruct {       int num_states;           /* in the DFA (and state table)       */       int max_states;           /* now allocated in the state table   */       int num_arcs;             /* in the arc table for this machine  */       int max_arcs;             /* now allocated in the arc table     */       StateTable state_table;   /* the compacted DFA state table      */       ArcTable arc_table;       /* the compacted DFA transition table */       SearchTree tree;          /* storing state labels used in build */       } DFAStruct;/******************************************************************************//*************************   Function Declarations   **************************/#ifdef __STDC__static char *GetMemory( char *ptr, int num_bytes );static void  DestroyTree( SearchTree tree );static int   GetState( DFA machine, StrList label, unsigned signature );static void  AddArc( DFA machine, int state, char arc_label,                     StrList state_label, unsigned state_signature );extern DFA   BuildDFA( StrList words );extern char *GetTerm( FILE *stream, DFA machine, int size, char *output );#elsestatic char *GetMemory();static void  DestroyTree();static int   GetState();static void  AddArc();extern DFA   BuildDFA();extern char *GetTerm();#endif/******************************************************************************//************************   Private Function Definitions   ********************//*FN***************************************************************************        GetMemory( ptr, num_bytes )   Returns: char * -- new/expanded block of memory   Purpose: Rationalize memory allocation and handle errors   Plan:    Part 1: Allocate memory with supplied allocation functions            Part 2: Handle any errors            Part 3: Return the allocated block of memory   Notes:   None.**/static char *GetMemory( ptr, num_bytes )   char *ptr;      /* in: expanded block; NULL if nonesuch */   int num_bytes;  /* in: number of bytes to allocate */   {   char *memory;   /* temporary for holding results */        /* Part 1: Allocate memory with supplied allocation functions */   if ( NULL == ptr )      memory = malloc( (unsigned)num_bytes );   else      memory = realloc( ptr, (unsigned)num_bytes );                    /* Part 2: Handle any errors */   if ( NULL == memory )      {      (void)fprintf( stderr, "malloc failure--aborting\n" );      exit(1);      }             /* Part 3: Return the allocated block of memory */   return( memory );   } /* GetMemory *//*FN***************************************************************************        DestroyTree( tree )   Returns: void   Purpose: Destroy a binary search tree created during machine construction   Plan:    Part 1: Return right away of there is no tree            Part 2: Deallocate the subtrees            Part 3: Deallocate the root   Notes:   None.**/static voidDestroyTree( tree )   SearchTree tree;   /* in: search tree destroyed */   {              /* Part 1: Return right away of there is no tree */   if ( NULL == tree ) return;                     /* Part 2: Deallocate the subtrees */   if ( NULL != tree->left )  DestroyTree( tree->left );   if ( NULL != tree->right ) DestroyTree( tree->right );                      /* Part 3: Deallocate the root */   tree->left = tree->right = NULL;   (void)free( (char *)tree );   } /* DestroyTree *//*FN***************************************************************************        GetState( machine, label, signature )   Returns: int -- state with the given label   Purpose: Search a machine and return the state with a given state label   Plan:    Part 1: Search the tree for the requested state

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -