📄 fts1_tokenizer1.c

📁 sqlite-3.4.1,嵌入式数据库.是一个功能强大的开源数据库,给学习和研发以及小型公司的发展带来了前所未有的好处.
💻 C
字号:
/*** The author disclaims copyright to this source code.***************************************************************************** Implementation of the "simple" full-text-search tokenizer.*//*** The code in this file is only compiled if:****     * The FTS1 module is being built as an extension**       (in which case SQLITE_CORE is not defined), or****     * The FTS1 module is being built into the core of**       SQLite (in which case SQLITE_ENABLE_FTS1 is defined).*/#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)#include <assert.h>#if !defined(__APPLE__)#include <malloc.h>#else#include <stdlib.h>#endif#include <stdio.h>#include <string.h>#include <ctype.h>#include "fts1_tokenizer.h"typedef struct simple_tokenizer {  sqlite3_tokenizer base;  char delim[128];             /* flag ASCII delimiters */} simple_tokenizer;typedef struct simple_tokenizer_cursor {  sqlite3_tokenizer_cursor base;  const char *pInput;          /* input we are tokenizing */  int nBytes;                  /* size of the input */  int iOffset;                 /* current position in pInput */  int iToken;                  /* index of next token to be returned */  char *pToken;                /* storage for current token */  int nTokenAllocated;         /* space allocated to zToken buffer */} simple_tokenizer_cursor;/* Forward declaration */static const sqlite3_tokenizer_module simpleTokenizerModule;static int isDelim(simple_tokenizer *t, unsigned char c){  return c<0x80 && t->delim[c];}/*** Create a new tokenizer instance.*/static int simpleCreate(  int argc, const char * const *argv,  sqlite3_tokenizer **ppTokenizer){  simple_tokenizer *t;  t = (simple_tokenizer *) calloc(sizeof(*t), 1);  if( t==NULL ) return SQLITE_NOMEM;  /* TODO(shess) Delimiters need to remain the same from run to run,  ** else we need to reindex.  One solution would be a meta-table to  ** track such information in the database, then we'd only want this  ** information on the initial create.  
*/  if( argc>1 ){    int i, n = strlen(argv[1]);    for(i=0; i<n; i++){      unsigned char ch = argv[1][i];      /* We explicitly don't support UTF-8 delimiters for now. */      if( ch>=0x80 ){        free(t);        return SQLITE_ERROR;      }      t->delim[ch] = 1;    }  } else {    /* Mark non-alphanumeric ASCII characters as delimiters */    int i;    for(i=1; i<0x80; i++){      t->delim[i] = !isalnum(i);    }  }  *ppTokenizer = &t->base;  return SQLITE_OK;}/*** Destroy a tokenizer*/static int simpleDestroy(sqlite3_tokenizer *pTokenizer){  free(pTokenizer);  return SQLITE_OK;}/*** Prepare to begin tokenizing a particular string.  The input** string to be tokenized is pInput[0..nBytes-1].  A cursor** used to incrementally tokenize this string is returned in ** *ppCursor.*/static int simpleOpen(  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */  const char *pInput, int nBytes,        /* String to be tokenized */  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */){  simple_tokenizer_cursor *c;  c = (simple_tokenizer_cursor *) malloc(sizeof(*c));  if( c==NULL ) return SQLITE_NOMEM;  c->pInput = pInput;  if( pInput==0 ){    c->nBytes = 0;  }else if( nBytes<0 ){    c->nBytes = (int)strlen(pInput);  }else{    c->nBytes = nBytes;  }  c->iOffset = 0;                 /* start tokenizing at the beginning */  c->iToken = 0;  c->pToken = NULL;               /* no space allocated, yet. */  c->nTokenAllocated = 0;  *ppCursor = &c->base;  return SQLITE_OK;}/*** Close a tokenization cursor previously opened by a call to** simpleOpen() above.*/static int simpleClose(sqlite3_tokenizer_cursor *pCursor){  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;  free(c->pToken);  free(c);  return SQLITE_OK;}/*** Extract the next token from a tokenization cursor.  
The cursor must** have been opened by a prior call to simpleOpen().*/static int simpleNext(  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */  const char **ppToken,               /* OUT: *ppToken is the token text */  int *pnBytes,                       /* OUT: Number of bytes in token */  int *piStartOffset,                 /* OUT: Starting offset of token */  int *piEndOffset,                   /* OUT: Ending offset of token */  int *piPosition                     /* OUT: Position integer of token */){  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;  unsigned char *p = (unsigned char *)c->pInput;  while( c->iOffset<c->nBytes ){    int iStartOffset;    /* Scan past delimiter characters */    while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){      c->iOffset++;    }    /* Count non-delimiter characters. */    iStartOffset = c->iOffset;    while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){      c->iOffset++;    }    if( c->iOffset>iStartOffset ){      int i, n = c->iOffset-iStartOffset;      if( n>c->nTokenAllocated ){        c->nTokenAllocated = n+20;        c->pToken = realloc(c->pToken, c->nTokenAllocated);        if( c->pToken==NULL ) return SQLITE_NOMEM;      }      for(i=0; i<n; i++){        /* TODO(shess) This needs expansion to handle UTF-8        ** case-insensitivity.        */        unsigned char ch = p[iStartOffset+i];        c->pToken[i] = ch<0x80 ? tolower(ch) : ch;      }      *ppToken = c->pToken;      *pnBytes = n;      *piStartOffset = iStartOffset;      *piEndOffset = c->iOffset;      *piPosition = c->iToken++;      return SQLITE_OK;    }  }  return SQLITE_DONE;}/*** The set of routines that implement the simple tokenizer*/static const sqlite3_tokenizer_module simpleTokenizerModule = {  0,  simpleCreate,  simpleDestroy,  simpleOpen,  simpleClose,  simpleNext,};/*** Allocate a new simple tokenizer.  
Return a pointer to the new** tokenizer in *ppModule*/void sqlite3Fts1SimpleTokenizerModule(  sqlite3_tokenizer_module const**ppModule){  *ppModule = &simpleTokenizerModule;}#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -