📄 fts2_porter.c

📁 sqlite-3.4.1,嵌入式数据库.是一个功能强大的开源数据库,给学习和研发以及小型公司的发展带来了前所未有的好处.
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
**** Any upper-case characters in the US-ASCII character set ([A-Z])** are converted to lower case.  Upper-case UTF characters are** unchanged.**** Words that are longer than about 20 bytes are stemmed by retaining** a few bytes from the beginning and the end of the word.  If the** word contains digits, 3 bytes are taken from the beginning and** 3 bytes from the end.  For long words without digits, 10 bytes** are taken from each end.  US-ASCII case folding still applies.** ** If the input word contains not digits but does characters not ** in [a-zA-Z] then no stemming is attempted and this routine just ** copies the input into the input into the output with US-ASCII** case folding.**** Stemming never increases the length of the word.  So there is** no chance of overflowing the zOut buffer.*/static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){  int i, j, c;  char zReverse[28];  char *z, *z2;  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){    /* The word is too big or too small for the porter stemmer.    ** Fallback to the copy stemmer */    copy_stemmer(zIn, nIn, zOut, pnOut);    return;  }  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){    c = zIn[i];    if( c>='A' && c<='Z' ){      zReverse[j] = c + 'a' - 'A';    }else if( c>='a' && c<='z' ){      zReverse[j] = c;    }else{      /* The use of a character not in [a-zA-Z] means that we fallback      ** to the copy stemmer */      copy_stemmer(zIn, nIn, zOut, pnOut);      return;    }  }  memset(&zReverse[sizeof(zReverse)-5], 0, 5);  z = &zReverse[j+1];  /* Step 1a */  if( z[0]=='s' ){    if(     !stem(&z, "sess", "ss", 0) &&     !stem(&z, "sei", "i", 0)  &&     !stem(&z, "ss", "ss", 0)    ){      z++;    }  }  /* Step 1b */    z2 = z;  if( stem(&z, "dee", "ee", m_gt_0) ){    /* Do nothing.  
The work was all in the test */  }else if(      (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))      && z!=z2  ){     if( stem(&z, "ta", "ate", 0) ||         stem(&z, "lb", "ble", 0) ||         stem(&z, "zi", "ize", 0) ){       /* Do nothing.  The work was all in the test */     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){       z++;     }else if( m_eq_1(z) && star_oh(z) ){       *(--z) = 'e';     }  }  /* Step 1c */  if( z[0]=='y' && hasVowel(z+1) ){    z[0] = 'i';  }  /* Step 2 */  switch( z[1] ){   case 'a':     stem(&z, "lanoita", "ate", m_gt_0) ||     stem(&z, "lanoit", "tion", m_gt_0);     break;   case 'c':     stem(&z, "icne", "ence", m_gt_0) ||     stem(&z, "icna", "ance", m_gt_0);     break;   case 'e':     stem(&z, "rezi", "ize", m_gt_0);     break;   case 'g':     stem(&z, "igol", "log", m_gt_0);     break;   case 'l':     stem(&z, "ilb", "ble", m_gt_0) ||     stem(&z, "illa", "al", m_gt_0) ||     stem(&z, "iltne", "ent", m_gt_0) ||     stem(&z, "ile", "e", m_gt_0) ||     stem(&z, "ilsuo", "ous", m_gt_0);     break;   case 'o':     stem(&z, "noitazi", "ize", m_gt_0) ||     stem(&z, "noita", "ate", m_gt_0) ||     stem(&z, "rota", "ate", m_gt_0);     break;   case 's':     stem(&z, "msila", "al", m_gt_0) ||     stem(&z, "ssenevi", "ive", m_gt_0) ||     stem(&z, "ssenluf", "ful", m_gt_0) ||     stem(&z, "ssensuo", "ous", m_gt_0);     break;   case 't':     stem(&z, "itila", "al", m_gt_0) ||     stem(&z, "itivi", "ive", m_gt_0) ||     stem(&z, "itilib", "ble", m_gt_0);     break;  }  /* Step 3 */  switch( z[0] ){   case 'e':     stem(&z, "etaci", "ic", m_gt_0) ||     stem(&z, "evita", "", m_gt_0)   ||     stem(&z, "ezila", "al", m_gt_0);     break;   case 'i':     stem(&z, "itici", "ic", m_gt_0);     break;   case 'l':     stem(&z, "laci", "ic", m_gt_0) ||     stem(&z, "luf", "", m_gt_0);     break;   case 's':     stem(&z, "ssen", "", m_gt_0);     break;  }  /* Step 4 */  switch( z[1] ){   case 'a':     if( z[0]=='l' 
&& m_gt_1(z+2) ){       z += 2;     }     break;   case 'c':     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){       z += 4;     }     break;   case 'e':     if( z[0]=='r' && m_gt_1(z+2) ){       z += 2;     }     break;   case 'i':     if( z[0]=='c' && m_gt_1(z+2) ){       z += 2;     }     break;   case 'l':     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){       z += 4;     }     break;   case 'n':     if( z[0]=='t' ){       if( z[2]=='a' ){         if( m_gt_1(z+3) ){           z += 3;         }       }else if( z[2]=='e' ){         stem(&z, "tneme", "", m_gt_1) ||         stem(&z, "tnem", "", m_gt_1) ||         stem(&z, "tne", "", m_gt_1);       }     }     break;   case 'o':     if( z[0]=='u' ){       if( m_gt_1(z+2) ){         z += 2;       }     }else if( z[3]=='s' || z[3]=='t' ){       stem(&z, "noi", "", m_gt_1);     }     break;   case 's':     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){       z += 3;     }     break;   case 't':     stem(&z, "eta", "", m_gt_1) ||     stem(&z, "iti", "", m_gt_1);     break;   case 'u':     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){       z += 3;     }     break;   case 'v':   case 'z':     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){       z += 3;     }     break;  }  /* Step 5a */  if( z[0]=='e' ){    if( m_gt_1(z+1) ){      z++;    }else if( m_eq_1(z+1) && !star_oh(z+1) ){      z++;    }  }  /* Step 5b */  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){    z++;  }  /* z[] is now the stemmed word in reverse order.  Flip it back  ** around into forward order and return.  */  *pnOut = i = strlen(z);  zOut[i] = 0;  while( *z ){    zOut[--i] = *(z++);  }}/*** Characters that can be part of a token.  We assume any character** whose value is greater than 0x80 (any UTF character) can be** part of a token.  
In other words, delimiters all must have** values of 0x7f or lower.*/static const char porterIdChar[] = {/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */};#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))/*** Extract the next token from a tokenization cursor.  The cursor must** have been opened by a prior call to porterOpen().*/static int porterNext(  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */  const char **pzToken,               /* OUT: *pzToken is the token text */  int *pnBytes,                       /* OUT: Number of bytes in token */  int *piStartOffset,                 /* OUT: Starting offset of token */  int *piEndOffset,                   /* OUT: Ending offset of token */  int *piPosition                     /* OUT: Position integer of token */){  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;  const char *z = c->zInput;  while( c->iOffset<c->nInput ){    int iStartOffset, ch;    /* Scan past delimiter characters */    while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){      c->iOffset++;    }    /* Count non-delimiter characters. 
*/    iStartOffset = c->iOffset;    while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){      c->iOffset++;    }    if( c->iOffset>iStartOffset ){      int n = c->iOffset-iStartOffset;      if( n>c->nAllocated ){        c->nAllocated = n+20;        c->zToken = realloc(c->zToken, c->nAllocated);        if( c->zToken==NULL ) return SQLITE_NOMEM;      }      porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);      *pzToken = c->zToken;      *piStartOffset = iStartOffset;      *piEndOffset = c->iOffset;      *piPosition = c->iToken++;      return SQLITE_OK;    }  }  return SQLITE_DONE;}/*** The set of routines that implement the porter-stemmer tokenizer*/static const sqlite3_tokenizer_module porterTokenizerModule = {  0,  porterCreate,  porterDestroy,  porterOpen,  porterClose,  porterNext,};/*** Allocate a new porter tokenizer.  Return a pointer to the new** tokenizer in *ppModule*/void sqlite3Fts2PorterTokenizerModule(  sqlite3_tokenizer_module const**ppModule){  *ppModule = &porterTokenizerModule;}#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
上一页 12
💿 文件大小 2186 K
👤 上传用户 hahuhuhuhu
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#sqlite #嵌入式数据库 #发展 #开源
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -