📄 fts1.c
字号:
/* The author disclaims copyright to this source code.
*
* This is an SQLite module implementing full-text search.
*/
/*
** The code in this file is only compiled if:
**
** * The FTS1 module is being built as an extension
** (in which case SQLITE_CORE is not defined), or
**
** * The FTS1 module is being built into the core of
** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
#if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE)
# define SQLITE_CORE 1
#endif
#include <assert.h>
#if !defined(__APPLE__)
#include <malloc.h>
#else
#include <stdlib.h>
#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "fts1.h"
#include "fts1_hash.h"
#include "fts1_tokenizer.h"
#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1
#if 0
# define TRACE(A) printf A; fflush(stdout)
#else
# define TRACE(A)
#endif
/* utility functions */
typedef struct StringBuffer {
int len; /* length, not including null terminator */
int alloced; /* Space allocated for s[] */
char *s; /* Content of the string */
} StringBuffer;
void initStringBuffer(StringBuffer *sb){
sb->len = 0;
sb->alloced = 100;
sb->s = malloc(100);
sb->s[0] = '\0';
}
void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
if( sb->len + nFrom >= sb->alloced ){
sb->alloced = sb->len + nFrom + 100;
sb->s = realloc(sb->s, sb->alloced+1);
if( sb->s==0 ){
initStringBuffer(sb);
return;
}
}
memcpy(sb->s + sb->len, zFrom, nFrom);
sb->len += nFrom;
sb->s[sb->len] = 0;
}
void append(StringBuffer *sb, const char *zFrom){
nappend(sb, zFrom, strlen(zFrom));
}
/* We encode variable-length integers in little-endian order using seven bits
* per byte as follows:
**
** KEY:
** A = 0xxxxxxx 7 bits of data and one flag bit
** B = 1xxxxxxx 7 bits of data and one flag bit
**
** 7 bits - A
** 14 bits - BA
** 21 bits - BBA
** and so on.
*/
/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
#define VARINT_MAX 10
/* Write a 64-bit variable-length integer to memory starting at p[0].
* The length of data written will be between 1 and VARINT_MAX bytes.
* The number of bytes written is returned. */
static int putVarint(char *p, sqlite_int64 v){
unsigned char *q = (unsigned char *) p;
sqlite_uint64 vu = v;
do{
*q++ = (unsigned char) ((vu & 0x7f) | 0x80);
vu >>= 7;
}while( vu!=0 );
q[-1] &= 0x7f; /* turn off high bit in final byte */
assert( q - (unsigned char *)p <= VARINT_MAX );
return (int) (q - (unsigned char *)p);
}
/* Read a 64-bit variable-length integer from memory starting at p[0].
* Return the number of bytes read, or 0 on error.
* The value is stored in *v. */
static int getVarint(const char *p, sqlite_int64 *v){
const unsigned char *q = (const unsigned char *) p;
sqlite_uint64 x = 0, y = 1;
while( (*q & 0x80) == 0x80 ){
x += y * (*q++ & 0x7f);
y <<= 7;
if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */
assert( 0 );
return 0;
}
}
x += y * (*q++);
*v = (sqlite_int64) x;
return (int) (q - (unsigned char *)p);
}
static int getVarint32(const char *p, int *pi){
sqlite_int64 i;
int ret = getVarint(p, &i);
*pi = (int) i;
assert( *pi==i );
return ret;
}
/*** Document lists ***
*
* A document list holds a sorted list of varint-encoded document IDs.
*
* A doclist with type DL_POSITIONS_OFFSETS is stored like this:
*
* array {
* varint docid;
* array {
* varint position; (delta from previous position plus POS_BASE)
* varint startOffset; (delta from previous startOffset)
* varint endOffset; (delta from startOffset)
* }
* }
*
* Here, array { X } means zero or more occurrences of X, adjacent in memory.
*
* A position list may hold positions for text in multiple columns. A position
* POS_COLUMN is followed by a varint containing the index of the column for
* following positions in the list. Any positions appearing before any
* occurrences of POS_COLUMN are for column 0.
*
* A doclist with type DL_POSITIONS is like the above, but holds only docids
* and positions without offset information.
*
* A doclist with type DL_DOCIDS is like the above, but holds only docids
* without positions or offset information.
*
* On disk, every document list has positions and offsets, so we don't bother
* to serialize a doclist's type.
*
* We don't yet delta-encode document IDs; doing so will probably be a
* modest win.
*
* NOTE(shess) I've thought of a slightly (1%) better offset encoding.
* After the first offset, estimate the next offset by using the
* current token position and the previous token position and offset,
* offset to handle some variance. So the estimate would be
* (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
* as normal. Offsets more than 64 chars from the estimate are
* encoded as the delta to the previous start offset + 128. An
* additional tiny increment can be gained by using the end offset of
* the previous token to make the estimate a tiny bit more precise.
*/
typedef enum DocListType {
DL_DOCIDS, /* docids only */
DL_POSITIONS, /* docids + positions */
DL_POSITIONS_OFFSETS /* docids + positions + offsets */
} DocListType;
/*
** By default, only positions and not offsets are stored in the doclists.
** To change this so that offsets are stored too, compile with
**
** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
**
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
#endif
typedef struct DocList {
char *pData;
int nData;
DocListType iType;
int iLastColumn; /* the last column written */
int iLastPos; /* the last position written */
int iLastOffset; /* the last start offset written */
} DocList;
enum {
POS_END = 0, /* end of this position list */
POS_COLUMN, /* followed by new column number */
POS_BASE
};
/* Initialize a new DocList to hold the given data. */
static void docListInit(DocList *d, DocListType iType,
const char *pData, int nData){
d->nData = nData;
if( nData>0 ){
d->pData = malloc(nData);
memcpy(d->pData, pData, nData);
} else {
d->pData = NULL;
}
d->iType = iType;
d->iLastColumn = 0;
d->iLastPos = d->iLastOffset = 0;
}
/* Create a new dynamically-allocated DocList. */
static DocList *docListNew(DocListType iType){
DocList *d = (DocList *) malloc(sizeof(DocList));
docListInit(d, iType, 0, 0);
return d;
}
static void docListDestroy(DocList *d){
free(d->pData);
#ifndef NDEBUG
memset(d, 0x55, sizeof(*d));
#endif
}
static void docListDelete(DocList *d){
docListDestroy(d);
free(d);
}
static char *docListEnd(DocList *d){
return d->pData + d->nData;
}
/* Append a varint to a DocList's data. */
static void appendVarint(DocList *d, sqlite_int64 i){
char c[VARINT_MAX];
int n = putVarint(c, i);
d->pData = realloc(d->pData, d->nData + n);
memcpy(d->pData + d->nData, c, n);
d->nData += n;
}
static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
appendVarint(d, iDocid);
if( d->iType>=DL_POSITIONS ){
appendVarint(d, POS_END); /* initially empty position list */
d->iLastColumn = 0;
d->iLastPos = d->iLastOffset = 0;
}
}
/* helper function for docListAddPos and docListAddPosOffset */
static void addPos(DocList *d, int iColumn, int iPos){
assert( d->nData>0 );
--d->nData; /* remove previous terminator */
if( iColumn!=d->iLastColumn ){
assert( iColumn>d->iLastColumn );
appendVarint(d, POS_COLUMN);
appendVarint(d, iColumn);
d->iLastColumn = iColumn;
d->iLastPos = d->iLastOffset = 0;
}
assert( iPos>=d->iLastPos );
appendVarint(d, iPos-d->iLastPos+POS_BASE);
d->iLastPos = iPos;
}
/* Add a position to the last position list in a doclist. */
static void docListAddPos(DocList *d, int iColumn, int iPos){
assert( d->iType==DL_POSITIONS );
addPos(d, iColumn, iPos);
appendVarint(d, POS_END); /* add new terminator */
}
/*
** Add a position and starting and ending offsets to a doclist.
**
** If the doclist is setup to handle only positions, then insert
** the position only and ignore the offsets.
*/
static void docListAddPosOffset(
DocList *d, /* Doclist under construction */
int iColumn, /* Column the inserted term is part of */
int iPos, /* Position of the inserted term */
int iStartOffset, /* Starting offset of inserted term */
int iEndOffset /* Ending offset of inserted term */
){
assert( d->iType>=DL_POSITIONS );
addPos(d, iColumn, iPos);
if( d->iType==DL_POSITIONS_OFFSETS ){
assert( iStartOffset>=d->iLastOffset );
appendVarint(d, iStartOffset-d->iLastOffset);
d->iLastOffset = iStartOffset;
assert( iEndOffset>=iStartOffset );
appendVarint(d, iEndOffset-iStartOffset);
}
appendVarint(d, POS_END); /* add new terminator */
}
/*
** A DocListReader object is a cursor into a doclist. Initialize
** the cursor to the beginning of the doclist by calling readerInit().
** Then use routines
**
** peekDocid()
** readDocid()
** readPosition()
** skipPositionList()
** and so forth...
**
** to read information out of the doclist. When we reach the end
** of the doclist, atEnd() returns TRUE.
*/
typedef struct DocListReader {
DocList *pDoclist; /* The document list we are stepping through */
char *p; /* Pointer to next unread byte in the doclist */
int iLastColumn;
int iLastPos; /* the last position read, or -1 when not in a position list */
} DocListReader;
/*
** Initialize the DocListReader r to point to the beginning of pDoclist.
*/
static void readerInit(DocListReader *r, DocList *pDoclist){
r->pDoclist = pDoclist;
if( pDoclist!=NULL ){
r->p = pDoclist->pData;
}
r->iLastColumn = -1;
r->iLastPos = -1;
}
/*
** Return TRUE if we have reached then end of pReader and there is
** nothing else left to read.
*/
static int atEnd(DocListReader *pReader){
return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist));
}
/* Peek at the next docid without advancing the read pointer.
*/
static sqlite_int64 peekDocid(DocListReader *pReader){
sqlite_int64 ret;
assert( !atEnd(pReader) );
assert( pReader->iLastPos==-1 );
getVarint(pReader->p, &ret);
return ret;
}
/* Read the next docid. See also nextDocid().
*/
static sqlite_int64 readDocid(DocListReader *pReader){
sqlite_int64 ret;
assert( !atEnd(pReader) );
assert( pReader->iLastPos==-1 );
pReader->p += getVarint(pReader->p, &ret);
if( pReader->pDoclist->iType>=DL_POSITIONS ){
pReader->iLastColumn = 0;
pReader->iLastPos = 0;
}
return ret;
}
/* Read the next position and column index from a position list.
* Returns the position, or -1 at the end of the list. */
static int readPosition(DocListReader *pReader, int *iColumn){
int i;
int iType = pReader->pDoclist->iType;
if( pReader->iLastPos==-1 ){
return -1;
}
assert( !atEnd(pReader) );
if( iType<DL_POSITIONS ){
return -1;
}
pReader->p += getVarint32(pReader->p, &i);
if( i==POS_END ){
pReader->iLastColumn = pReader->iLastPos = -1;
*iColumn = -1;
return -1;
}
if( i==POS_COLUMN ){
pReader->p += getVarint32(pReader->p, &pReader->iLastColumn);
pReader->iLastPos = 0;
pReader->p += getVarint32(pReader->p, &i);
assert( i>=POS_BASE );
}
pReader->iLastPos += ((int) i)-POS_BASE;
if( iType>=DL_POSITIONS_OFFSETS ){
/* Skip over offsets, ignoring them for now. */
int iStart, iEnd;
pReader->p += getVarint32(pReader->p, &iStart);
pReader->p += getVarint32(pReader->p, &iEnd);
}
*iColumn = pReader->iLastColumn;
return pReader->iLastPos;
}
/* Skip past the end of a position list. */
static void skipPositionList(DocListReader *pReader){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -