📄 fts1.c
字号:
/* The author disclaims copyright to this source code. * * This is an SQLite module implementing full-text search. *//*** The code in this file is only compiled if:**** * The FTS1 module is being built as an extension** (in which case SQLITE_CORE is not defined), or**** * The FTS1 module is being built into the core of** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).*/#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)#if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE)# define SQLITE_CORE 1#endif#include <assert.h>#if !defined(__APPLE__)#include <malloc.h>#else#include <stdlib.h>#endif#include <stdio.h>#include <string.h>#include <ctype.h>#include "fts1.h"#include "fts1_hash.h"#include "fts1_tokenizer.h"#include "sqlite3.h"#include "sqlite3ext.h"SQLITE_EXTENSION_INIT1#if 0# define TRACE(A) printf A; fflush(stdout)#else# define TRACE(A)#endif/* utility functions */typedef struct StringBuffer { int len; /* length, not including null terminator */ int alloced; /* Space allocated for s[] */ char *s; /* Content of the string */} StringBuffer;static void initStringBuffer(StringBuffer *sb){ sb->len = 0; sb->alloced = 100; sb->s = malloc(100); sb->s[0] = '\0';}static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){ if( sb->len + nFrom >= sb->alloced ){ sb->alloced = sb->len + nFrom + 100; sb->s = realloc(sb->s, sb->alloced+1); if( sb->s==0 ){ initStringBuffer(sb); return; } } memcpy(sb->s + sb->len, zFrom, nFrom); sb->len += nFrom; sb->s[sb->len] = 0;}static void append(StringBuffer *sb, const char *zFrom){ nappend(sb, zFrom, strlen(zFrom));}/* We encode variable-length integers in little-endian order using seven bits * per byte as follows:**** KEY:** A = 0xxxxxxx 7 bits of data and one flag bit** B = 1xxxxxxx 7 bits of data and one flag bit**** 7 bits - A** 14 bits - BA** 21 bits - BBA** and so on.*//* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */#define VARINT_MAX 10/* Write a 64-bit variable-length integer to memory starting at p[0]. * The length of data written will be between 1 and VARINT_MAX bytes. * The number of bytes written is returned. */static int putVarint(char *p, sqlite_int64 v){ unsigned char *q = (unsigned char *) p; sqlite_uint64 vu = v; do{ *q++ = (unsigned char) ((vu & 0x7f) | 0x80); vu >>= 7; }while( vu!=0 ); q[-1] &= 0x7f; /* turn off high bit in final byte */ assert( q - (unsigned char *)p <= VARINT_MAX ); return (int) (q - (unsigned char *)p);}/* Read a 64-bit variable-length integer from memory starting at p[0]. * Return the number of bytes read, or 0 on error. * The value is stored in *v. */static int getVarint(const char *p, sqlite_int64 *v){ const unsigned char *q = (const unsigned char *) p; sqlite_uint64 x = 0, y = 1; while( (*q & 0x80) == 0x80 ){ x += y * (*q++ & 0x7f); y <<= 7; if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ assert( 0 ); return 0; } } x += y * (*q++); *v = (sqlite_int64) x; return (int) (q - (unsigned char *)p);}static int getVarint32(const char *p, int *pi){ sqlite_int64 i; int ret = getVarint(p, &i); *pi = (int) i; assert( *pi==i ); return ret;}/*** Document lists *** * * A document list holds a sorted list of varint-encoded document IDs. * * A doclist with type DL_POSITIONS_OFFSETS is stored like this: * * array { * varint docid; * array { * varint position; (delta from previous position plus POS_BASE) * varint startOffset; (delta from previous startOffset) * varint endOffset; (delta from startOffset) * } * } * * Here, array { X } means zero or more occurrences of X, adjacent in memory. * * A position list may hold positions for text in multiple columns. A position * POS_COLUMN is followed by a varint containing the index of the column for * following positions in the list. Any positions appearing before any * occurrences of POS_COLUMN are for column 0. * * A doclist with type DL_POSITIONS is like the above, but holds only docids * and positions without offset information. * * A doclist with type DL_DOCIDS is like the above, but holds only docids * without positions or offset information. * * On disk, every document list has positions and offsets, so we don't bother * to serialize a doclist's type. * * We don't yet delta-encode document IDs; doing so will probably be a * modest win. * * NOTE(shess) I've thought of a slightly (1%) better offset encoding. * After the first offset, estimate the next offset by using the * current token position and the previous token position and offset, * offset to handle some variance. So the estimate would be * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded * as normal. Offsets more than 64 chars from the estimate are * encoded as the delta to the previous start offset + 128. An * additional tiny increment can be gained by using the end offset of * the previous token to make the estimate a tiny bit more precise.*//* It is not safe to call isspace(), tolower(), or isalnum() on** hi-bit-set characters. This is the same solution used in the** tokenizer.*//* TODO(shess) The snippet-generation code should be using the** tokenizer-generated tokens rather than doing its own local** tokenization.*//* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */static int safe_isspace(char c){ return (c&0x80)==0 ? isspace(c) : 0;}static int safe_tolower(char c){ return (c&0x80)==0 ? tolower(c) : c;}static int safe_isalnum(char c){ return (c&0x80)==0 ? isalnum(c) : 0;}typedef enum DocListType { DL_DOCIDS, /* docids only */ DL_POSITIONS, /* docids + positions */ DL_POSITIONS_OFFSETS /* docids + positions + offsets */} DocListType;/*** By default, only positions and not offsets are stored in the doclists.** To change this so that offsets are stored too, compile with**** -DDL_DEFAULT=DL_POSITIONS_OFFSETS***/#ifndef DL_DEFAULT# define DL_DEFAULT DL_POSITIONS#endiftypedef struct DocList { char *pData; int nData; DocListType iType; int iLastColumn; /* the last column written */ int iLastPos; /* the last position written */ int iLastOffset; /* the last start offset written */} DocList;enum { POS_END = 0, /* end of this position list */ POS_COLUMN, /* followed by new column number */ POS_BASE};/* Initialize a new DocList to hold the given data. */static void docListInit(DocList *d, DocListType iType, const char *pData, int nData){ d->nData = nData; if( nData>0 ){ d->pData = malloc(nData); memcpy(d->pData, pData, nData); } else { d->pData = NULL; } d->iType = iType; d->iLastColumn = 0; d->iLastPos = d->iLastOffset = 0;}/* Create a new dynamically-allocated DocList. */static DocList *docListNew(DocListType iType){ DocList *d = (DocList *) malloc(sizeof(DocList)); docListInit(d, iType, 0, 0); return d;}static void docListDestroy(DocList *d){ free(d->pData);#ifndef NDEBUG memset(d, 0x55, sizeof(*d));#endif}static void docListDelete(DocList *d){ docListDestroy(d); free(d);}static char *docListEnd(DocList *d){ return d->pData + d->nData;}/* Append a varint to a DocList's data. */static void appendVarint(DocList *d, sqlite_int64 i){ char c[VARINT_MAX]; int n = putVarint(c, i); d->pData = realloc(d->pData, d->nData + n); memcpy(d->pData + d->nData, c, n); d->nData += n;}static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ appendVarint(d, iDocid); if( d->iType>=DL_POSITIONS ){ appendVarint(d, POS_END); /* initially empty position list */ d->iLastColumn = 0; d->iLastPos = d->iLastOffset = 0; }}/* helper function for docListAddPos and docListAddPosOffset */static void addPos(DocList *d, int iColumn, int iPos){ assert( d->nData>0 ); --d->nData; /* remove previous terminator */ if( iColumn!=d->iLastColumn ){ assert( iColumn>d->iLastColumn ); appendVarint(d, POS_COLUMN); appendVarint(d, iColumn); d->iLastColumn = iColumn; d->iLastPos = d->iLastOffset = 0; } assert( iPos>=d->iLastPos ); appendVarint(d, iPos-d->iLastPos+POS_BASE); d->iLastPos = iPos;}/* Add a position to the last position list in a doclist. */static void docListAddPos(DocList *d, int iColumn, int iPos){ assert( d->iType==DL_POSITIONS ); addPos(d, iColumn, iPos); appendVarint(d, POS_END); /* add new terminator */}/*** Add a position and starting and ending offsets to a doclist.**** If the doclist is setup to handle only positions, then insert** the position only and ignore the offsets.*/static void docListAddPosOffset( DocList *d, /* Doclist under construction */ int iColumn, /* Column the inserted term is part of */ int iPos, /* Position of the inserted term */ int iStartOffset, /* Starting offset of inserted term */ int iEndOffset /* Ending offset of inserted term */){ assert( d->iType>=DL_POSITIONS ); addPos(d, iColumn, iPos); if( d->iType==DL_POSITIONS_OFFSETS ){ assert( iStartOffset>=d->iLastOffset ); appendVarint(d, iStartOffset-d->iLastOffset); d->iLastOffset = iStartOffset; assert( iEndOffset>=iStartOffset ); appendVarint(d, iEndOffset-iStartOffset); } appendVarint(d, POS_END); /* add new terminator */}/*** A DocListReader object is a cursor into a doclist. Initialize** the cursor to the beginning of the doclist by calling readerInit().** Then use routines**** peekDocid()** readDocid()** readPosition()** skipPositionList()** and so forth...**** to read information out of the doclist. When we reach the end** of the doclist, atEnd() returns TRUE.*/typedef struct DocListReader { DocList *pDoclist; /* The document list we are stepping through */ char *p; /* Pointer to next unread byte in the doclist */ int iLastColumn; int iLastPos; /* the last position read, or -1 when not in a position list */} DocListReader;/*** Initialize the DocListReader r to point to the beginning of pDoclist.*/static void readerInit(DocListReader *r, DocList *pDoclist){ r->pDoclist = pDoclist; if( pDoclist!=NULL ){ r->p = pDoclist->pData; } r->iLastColumn = -1; r->iLastPos = -1;}/*** Return TRUE if we have reached then end of pReader and there is** nothing else left to read.*/static int atEnd(DocListReader *pReader){ return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist));}/* Peek at the next docid without advancing the read pointer. */static sqlite_int64 peekDocid(DocListReader *pReader){ sqlite_int64 ret; assert( !atEnd(pReader) ); assert( pReader->iLastPos==-1 ); getVarint(pReader->p, &ret); return ret;}/* Read the next docid. See also nextDocid().*/static sqlite_int64 readDocid(DocListReader *pReader){ sqlite_int64 ret; assert( !atEnd(pReader) ); assert( pReader->iLastPos==-1 ); pReader->p += getVarint(pReader->p, &ret); if( pReader->pDoclist->iType>=DL_POSITIONS ){ pReader->iLastColumn = 0; pReader->iLastPos = 0; } return ret;}/* Read the next position and column index from a position list. * Returns the position, or -1 at the end of the list. */static int readPosition(DocListReader *pReader, int *iColumn){ int i; int iType = pReader->pDoclist->iType; if( pReader->iLastPos==-1 ){ return -1; } assert( !atEnd(pReader) ); if( iType<DL_POSITIONS ){ return -1; } pReader->p += getVarint32(pReader->p, &i); if( i==POS_END ){ pReader->iLastColumn = pReader->iLastPos = -1; *iColumn = -1; return -1; } if( i==POS_COLUMN ){ pReader->p += getVarint32(pReader->p, &pReader->iLastColumn); pReader->iLastPos = 0; pReader->p += getVarint32(pReader->p, &i); assert( i>=POS_BASE ); } pReader->iLastPos += ((int) i)-POS_BASE; if( iType>=DL_POSITIONS_OFFSETS ){ /* Skip over offsets, ignoring them for now. */ int iStart, iEnd; pReader->p += getVarint32(pReader->p, &iStart); pReader->p += getVarint32(pReader->p, &iEnd); } *iColumn = pReader->iLastColumn;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -