📄 fulltext.c

📁 最新的sqlite3.6.2源代码
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* The author disclaims copyright to this source code. * * This is an SQLite module implementing full-text search. */#include <assert.h>#if !defined(__APPLE__)#include <malloc.h>#else#include <stdlib.h>#endif#include <stdio.h>#include <string.h>#include <ctype.h>#include "fulltext.h"#include "ft_hash.h"#include "tokenizer.h"#include "sqlite3.h"#include "sqlite3ext.h"SQLITE_EXTENSION_INIT1/* utility functions *//* We encode variable-length integers in little-endian order using seven bits * per byte as follows:**** KEY:**         A = 0xxxxxxx    7 bits of data and one flag bit**         B = 1xxxxxxx    7 bits of data and one flag bit****  7 bits - A** 14 bits - BA** 21 bits - BBA** and so on.*//* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */#define VARINT_MAX 10/* Write a 64-bit variable-length integer to memory starting at p[0]. * The length of data written will be between 1 and VARINT_MAX bytes. * The number of bytes written is returned. */static int putVarint(char *p, sqlite_int64 v){  unsigned char *q = (unsigned char *) p;  sqlite_uint64 vu = v;  do{    *q++ = (unsigned char) ((vu & 0x7f) | 0x80);    vu >>= 7;  }while( vu!=0 );  q[-1] &= 0x7f;  /* turn off high bit in final byte */  assert( q - (unsigned char *)p <= VARINT_MAX );  return (int) (q - (unsigned char *)p);}/* Read a 64-bit variable-length integer from memory starting at p[0]. * Return the number of bytes read, or 0 on error. * The value is stored in *v. */static int getVarint(const char *p, sqlite_int64 *v){  const unsigned char *q = (const unsigned char *) p;  sqlite_uint64 x = 0, y = 1;  while( (*q & 0x80) == 0x80 ){    x += y * (*q++ & 0x7f);    y <<= 7;    if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */      assert( 0 );      return 0;    }  }  x += y * (*q++);  *v = (sqlite_int64) x;  return (int) (q - (unsigned char *)p);}static int getVarint32(const char *p, int *pi){ sqlite_int64 i; int ret = getVarint(p, &i); *pi = (int) i; assert( *pi==i ); return ret;}/*** Document lists *** * * A document list holds a sorted list of varint-encoded document IDs. * * A doclist with type DL_POSITIONS_OFFSETS is stored like this: * * array { *   varint docid; *   array { *     varint position;     (delta from previous position plus 1, or 0 for end) *     varint startOffset;  (delta from previous startOffset) *     varint endOffset;    (delta from startOffset) *   } * } * * Here, array { X } means zero or more occurrences of X, adjacent in memory. * * A doclist with type DL_POSITIONS is like the above, but holds only docids * and positions without offset information. * * A doclist with type DL_DOCIDS is like the above, but holds only docids * without positions or offset information. * * On disk, every document list has positions and offsets, so we don't bother * to serialize a doclist's type. *  * We don't yet delta-encode document IDs; doing so will probably be a * modest win. * * NOTE(shess) I've thought of a slightly (1%) better offset encoding. * After the first offset, estimate the next offset by using the * current token position and the previous token position and offset, * offset to handle some variance.  So the estimate would be * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded * as normal.  Offsets more than 64 chars from the estimate are * encoded as the delta to the previous start offset + 128.  An * additional tiny increment can be gained by using the end offset of * the previous token to make the estimate a tiny bit more precise.*/typedef enum DocListType {  DL_DOCIDS,              /* docids only */  DL_POSITIONS,           /* docids + positions */  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */} DocListType;typedef struct DocList {  char *pData;  int nData;  DocListType iType;  int iLastPos;       /* the last position written */  int iLastOffset;    /* the last start offset written */} DocList;/* Initialize a new DocList to hold the given data. */static void docListInit(DocList *d, DocListType iType,                        const char *pData, int nData){  d->nData = nData;  if( nData>0 ){    d->pData = malloc(nData);    memcpy(d->pData, pData, nData);  } else {    d->pData = NULL;  }  d->iType = iType;  d->iLastPos = 0;  d->iLastOffset = 0;}/* Create a new dynamically-allocated DocList. */static DocList *docListNew(DocListType iType){  DocList *d = (DocList *) malloc(sizeof(DocList));  docListInit(d, iType, 0, 0);  return d;}static void docListDestroy(DocList *d){  free(d->pData);#ifndef NDEBUG  memset(d, 0x55, sizeof(*d));#endif}static void docListDelete(DocList *d){  docListDestroy(d);  free(d);}static char *docListEnd(DocList *d){  return d->pData + d->nData;}/* Append a varint to a DocList's data. */static void appendVarint(DocList *d, sqlite_int64 i){  char c[VARINT_MAX];  int n = putVarint(c, i);  d->pData = realloc(d->pData, d->nData + n);  memcpy(d->pData + d->nData, c, n);  d->nData += n;}static void docListAddDocid(DocList *d, sqlite_int64 iDocid){  appendVarint(d, iDocid);  d->iLastPos = 0;}/* Add a position to the last position list in a doclist. */static void docListAddPos(DocList *d, int iPos){  assert( d->iType>=DL_POSITIONS );  appendVarint(d, iPos-d->iLastPos+1);  d->iLastPos = iPos;}static void docListAddPosOffset(DocList *d, int iPos,                                int iStartOffset, int iEndOffset){  assert( d->iType==DL_POSITIONS_OFFSETS );  docListAddPos(d, iPos);  appendVarint(d, iStartOffset-d->iLastOffset);  d->iLastOffset = iStartOffset;  appendVarint(d, iEndOffset-iStartOffset);}/* Terminate the last position list in the given doclist. */static void docListAddEndPos(DocList *d){  appendVarint(d, 0);}typedef struct DocListReader {  DocList *pDoclist;  char *p;  int iLastPos;    /* the last position read */} DocListReader;static void readerInit(DocListReader *r, DocList *pDoclist){  r->pDoclist = pDoclist;  if( pDoclist!=NULL ){    r->p = pDoclist->pData;  }  r->iLastPos = 0;}static int readerAtEnd(DocListReader *pReader){  return pReader->p >= docListEnd(pReader->pDoclist);}/* Peek at the next docid without advancing the read pointer. */static sqlite_int64 peekDocid(DocListReader *pReader){  sqlite_int64 ret;  assert( !readerAtEnd(pReader) );  getVarint(pReader->p, &ret);  return ret;}/* Read the next docid. */static sqlite_int64 readDocid(DocListReader *pReader){  sqlite_int64 ret;  assert( !readerAtEnd(pReader) );  pReader->p += getVarint(pReader->p, &ret);  pReader->iLastPos = 0;  return ret;}/* Read the next position from a position list. * Returns the position, or -1 at the end of the list. */static int readPosition(DocListReader *pReader){  int i;  int iType = pReader->pDoclist->iType;  assert( iType>=DL_POSITIONS );  assert( !readerAtEnd(pReader) );  pReader->p += getVarint32(pReader->p, &i);  if( i==0 ){    pReader->iLastPos = -1;    return -1;  }  pReader->iLastPos += ((int) i)-1;  if( iType>=DL_POSITIONS_OFFSETS ){    /* Skip over offsets, ignoring them for now. */    int iStart, iEnd;    pReader->p += getVarint32(pReader->p, &iStart);    pReader->p += getVarint32(pReader->p, &iEnd);  }  return pReader->iLastPos;}/* Skip past the end of a position list. */static void skipPositionList(DocListReader *pReader){  while( readPosition(pReader)!=-1 )    ;}/* Skip over a docid, including its position list if the doclist has * positions. */static void skipDocument(DocListReader *pReader){  readDocid(pReader);  if( pReader->pDoclist->iType >= DL_POSITIONS ){    skipPositionList(pReader);  }}static sqlite_int64 firstDocid(DocList *d){  DocListReader r;  readerInit(&r, d);  return readDocid(&r);}/* Doclist multi-tool.  Pass pUpdate==NULL to delete the indicated docid; * otherwise pUpdate, which must contain only the single docid [iDocid], is * inserted (if not present) or updated (if already present). */static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){  int modified = 0;  DocListReader reader;  char *p;  if( pUpdate!=NULL ){    assert( d->iType==pUpdate->iType);    assert( iDocid==firstDocid(pUpdate) );  }  readerInit(&reader, d);  while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){    skipDocument(&reader);  }  p = reader.p;  /* Delete if there is a matching element. */  if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){    skipDocument(&reader);    memmove(p, reader.p, docListEnd(d) - reader.p);    d->nData -= (reader.p - p);    modified = 1;  }  /* Insert if indicated. */  if( pUpdate!=NULL ){    int iDoclist = p-d->pData;    docListAddEndPos(pUpdate);    d->pData = realloc(d->pData, d->nData+pUpdate->nData);    p = d->pData + iDoclist;    memmove(p+pUpdate->nData, p, docListEnd(d) - p);    memcpy(p, pUpdate->pData, pUpdate->nData);    d->nData += pUpdate->nData;    modified = 1;  }  return modified;}/* Split the second half of doclist d into a separate doclist d2.  Returns 1 * if successful, or 0 if d contains a single document and hence can't be * split. */static int docListSplit(DocList *d, DocList *d2){  const char *pSplitPoint = d->pData + d->nData / 2;  DocListReader reader;  readerInit(&reader, d);  while( reader.p<pSplitPoint ){    skipDocument(&reader);  }  if( readerAtEnd(&reader) ) return 0;  docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p);  d->nData = reader.p - d->pData;  d->pData = realloc(d->pData, d->nData);  return 1;}/* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked * on-disk doclist, resulting in another in-memory DocList [out].  [in] * and [out] may or may not store position information according to the * caller's wishes.  The on-disk doclist always comes with positions. * * The caller must read each chunk of the on-disk doclist in succession and * pass it to mergeBlock(). * * If [in] has positions, then the merge output contains only documents with * matching positions in the two input doclists.  If [in] does not have * positions, then the merge output contains all documents common to the two * input doclists. * * If [in] is NULL, then the on-disk doclist is copied to [out] directly. * * A merge is performed using an integer [iOffset] provided by the caller. * [iOffset] is subtracted from each position in the on-disk doclist for the * purpose of position comparison; this is helpful in implementing phrase * searches. * * A DocListMerge is not yet able to propagate offsets through query * processing; we should add that capability soon.*/typedef struct DocListMerge {  DocListReader in;  DocList *pOut;  int iOffset;} DocListMerge;static void mergeInit(DocListMerge *m,                      DocList *pIn, int iOffset, DocList *pOut){  readerInit(&m->in, pIn);  m->pOut = pOut;  m->iOffset = iOffset;  /* can't handle offsets yet */  assert( pIn==NULL || pIn->iType <= DL_POSITIONS );  assert( pOut->iType <= DL_POSITIONS );}/* A helper function for mergeBlock(), below.  Merge the position lists * pointed to by m->in and pBlockReader. * If the merge matches, write [iDocid] to m->pOut; if m->pOut * has positions then write all matching positions as well. */static void mergePosList(DocListMerge *m, sqlite_int64 iDocid,                  DocListReader *pBlockReader){  int block_pos = readPosition(pBlockReader);  int in_pos = readPosition(&m->in);  int match = 0;  while( block_pos!=-1 || in_pos!=-1 ){    if( block_pos-m->iOffset==in_pos ){      if( !match ){        docListAddDocid(m->pOut, iDocid);        match = 1;      }      if( m->pOut->iType >= DL_POSITIONS ){        docListAddPos(m->pOut, in_pos);      }      block_pos = readPosition(pBlockReader);      in_pos = readPosition(&m->in);    } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){      block_pos = readPosition(pBlockReader);    } else {      in_pos = readPosition(&m->in);    }  }  if( m->pOut->iType >= DL_POSITIONS && match ){    docListAddEndPos(m->pOut);  }}/* Merge one block of an on-disk doclist into a DocListMerge. */static void mergeBlock(DocListMerge *m, DocList *pBlock){  DocListReader blockReader;  assert( pBlock->iType >= DL_POSITIONS );  readerInit(&blockReader, pBlock);  while( !readerAtEnd(&blockReader) ){    sqlite_int64 iDocid = readDocid(&blockReader);    if( m->in.pDoclist!=NULL ){      while( 1 ){        if( readerAtEnd(&m->in) ) return;  /* nothing more to merge */        if( peekDocid(&m->in)>=iDocid ) break;        skipDocument(&m->in);      }      if( peekDocid(&m->in)>iDocid ){  /* [pIn] has no match with iDocid */        skipPositionList(&blockReader);  /* skip this docid in the block */        continue;      }      readDocid(&m->in);    }    /* We have a document match. */    if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){      /* We don't need to do a poslist merge. */      docListAddDocid(m->pOut, iDocid);      if( m->pOut->iType >= DL_POSITIONS ){        /* Copy all positions to the output doclist. */        while( 1 ){          int pos = readPosition(&blockReader);          if( pos==-1 ) break;          docListAddPos(m->pOut, pos);        }        docListAddEndPos(m->pOut);      } else skipPositionList(&blockReader);      continue;    }    mergePosList(m, iDocid, &blockReader);  }}static char *string_dup_n(const char *s, int n){  char *str = malloc(n + 1);  memcpy(str, s, n);  str[n] = '\0';  return str;}/* Duplicate a string; the caller must free() the returned string. * (We don't use strdup() since it's not part of the standard C library and * may not be available everywhere.) */static char *string_dup(const char *s){  return string_dup_n(s, strlen(s));}/* Format a string, replacing each occurrence of the % character with * zName.  This may be more convenient than sqlite_mprintf() * when one string is used repeatedly in a format string. * The caller must free() the returned string. */static char *string_format(const char *zFormat, const char *zName){  const char *p;  size_t len = 0;  size_t nName = strlen(zName);  char *result;  char *r;  /* first compute length needed */  for(p = zFormat ; *p ; ++p){    len += (*p=='%' ? nName : 1);  }  len += 1;  /* for null terminator */  r = result = malloc(len);  for(p = zFormat; *p; ++p){    if( *p=='%' ){      memcpy(r, zName, nName);      r += nName;    } else {      *r++ = *p;    }  }
12 3 下一页
💿 文件大小 2259 K
👤 上传用户 dante55
📂 所属分类其他数据库
🏷️ 相关标签

#sqlite #源代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -