📄 fulltext.c
字号:
/* The author disclaims copyright to this source code. * * This is an SQLite module implementing full-text search. */#include <assert.h>#if !defined(__APPLE__)#include <malloc.h>#else#include <stdlib.h>#endif#include <stdio.h>#include <string.h>#include <ctype.h>#include "fulltext.h"#include "ft_hash.h"#include "tokenizer.h"#include "sqlite3.h"#include "sqlite3ext.h"SQLITE_EXTENSION_INIT1/* utility functions *//* We encode variable-length integers in little-endian order using seven bits * per byte as follows:**** KEY:** A = 0xxxxxxx 7 bits of data and one flag bit** B = 1xxxxxxx 7 bits of data and one flag bit**** 7 bits - A** 14 bits - BA** 21 bits - BBA** and so on.*//* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */#define VARINT_MAX 10/* Write a 64-bit variable-length integer to memory starting at p[0]. * The length of data written will be between 1 and VARINT_MAX bytes. * The number of bytes written is returned. */static int putVarint(char *p, sqlite_int64 v){ unsigned char *q = (unsigned char *) p; sqlite_uint64 vu = v; do{ *q++ = (unsigned char) ((vu & 0x7f) | 0x80); vu >>= 7; }while( vu!=0 ); q[-1] &= 0x7f; /* turn off high bit in final byte */ assert( q - (unsigned char *)p <= VARINT_MAX ); return (int) (q - (unsigned char *)p);}/* Read a 64-bit variable-length integer from memory starting at p[0]. * Return the number of bytes read, or 0 on error. * The value is stored in *v. */static int getVarint(const char *p, sqlite_int64 *v){ const unsigned char *q = (const unsigned char *) p; sqlite_uint64 x = 0, y = 1; while( (*q & 0x80) == 0x80 ){ x += y * (*q++ & 0x7f); y <<= 7; if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ assert( 0 ); return 0; } } x += y * (*q++); *v = (sqlite_int64) x; return (int) (q - (unsigned char *)p);}static int getVarint32(const char *p, int *pi){ sqlite_int64 i; int ret = getVarint(p, &i); *pi = (int) i; assert( *pi==i ); return ret;}/*** Document lists *** * * A document list holds a sorted list of varint-encoded document IDs. * * A doclist with type DL_POSITIONS_OFFSETS is stored like this: * * array { * varint docid; * array { * varint position; (delta from previous position plus 1, or 0 for end) * varint startOffset; (delta from previous startOffset) * varint endOffset; (delta from startOffset) * } * } * * Here, array { X } means zero or more occurrences of X, adjacent in memory. * * A doclist with type DL_POSITIONS is like the above, but holds only docids * and positions without offset information. * * A doclist with type DL_DOCIDS is like the above, but holds only docids * without positions or offset information. * * On disk, every document list has positions and offsets, so we don't bother * to serialize a doclist's type. * * We don't yet delta-encode document IDs; doing so will probably be a * modest win. * * NOTE(shess) I've thought of a slightly (1%) better offset encoding. * After the first offset, estimate the next offset by using the * current token position and the previous token position and offset, * offset to handle some variance. So the estimate would be * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded * as normal. Offsets more than 64 chars from the estimate are * encoded as the delta to the previous start offset + 128. An * additional tiny increment can be gained by using the end offset of * the previous token to make the estimate a tiny bit more precise.*/typedef enum DocListType { DL_DOCIDS, /* docids only */ DL_POSITIONS, /* docids + positions */ DL_POSITIONS_OFFSETS /* docids + positions + offsets */} DocListType;typedef struct DocList { char *pData; int nData; DocListType iType; int iLastPos; /* the last position written */ int iLastOffset; /* the last start offset written */} DocList;/* Initialize a new DocList to hold the given data. */static void docListInit(DocList *d, DocListType iType, const char *pData, int nData){ d->nData = nData; if( nData>0 ){ d->pData = malloc(nData); memcpy(d->pData, pData, nData); } else { d->pData = NULL; } d->iType = iType; d->iLastPos = 0; d->iLastOffset = 0;}/* Create a new dynamically-allocated DocList. */static DocList *docListNew(DocListType iType){ DocList *d = (DocList *) malloc(sizeof(DocList)); docListInit(d, iType, 0, 0); return d;}static void docListDestroy(DocList *d){ free(d->pData);#ifndef NDEBUG memset(d, 0x55, sizeof(*d));#endif}static void docListDelete(DocList *d){ docListDestroy(d); free(d);}static char *docListEnd(DocList *d){ return d->pData + d->nData;}/* Append a varint to a DocList's data. */static void appendVarint(DocList *d, sqlite_int64 i){ char c[VARINT_MAX]; int n = putVarint(c, i); d->pData = realloc(d->pData, d->nData + n); memcpy(d->pData + d->nData, c, n); d->nData += n;}static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ appendVarint(d, iDocid); d->iLastPos = 0;}/* Add a position to the last position list in a doclist. */static void docListAddPos(DocList *d, int iPos){ assert( d->iType>=DL_POSITIONS ); appendVarint(d, iPos-d->iLastPos+1); d->iLastPos = iPos;}static void docListAddPosOffset(DocList *d, int iPos, int iStartOffset, int iEndOffset){ assert( d->iType==DL_POSITIONS_OFFSETS ); docListAddPos(d, iPos); appendVarint(d, iStartOffset-d->iLastOffset); d->iLastOffset = iStartOffset; appendVarint(d, iEndOffset-iStartOffset);}/* Terminate the last position list in the given doclist. */static void docListAddEndPos(DocList *d){ appendVarint(d, 0);}typedef struct DocListReader { DocList *pDoclist; char *p; int iLastPos; /* the last position read */} DocListReader;static void readerInit(DocListReader *r, DocList *pDoclist){ r->pDoclist = pDoclist; if( pDoclist!=NULL ){ r->p = pDoclist->pData; } r->iLastPos = 0;}static int readerAtEnd(DocListReader *pReader){ return pReader->p >= docListEnd(pReader->pDoclist);}/* Peek at the next docid without advancing the read pointer. */static sqlite_int64 peekDocid(DocListReader *pReader){ sqlite_int64 ret; assert( !readerAtEnd(pReader) ); getVarint(pReader->p, &ret); return ret;}/* Read the next docid. */static sqlite_int64 readDocid(DocListReader *pReader){ sqlite_int64 ret; assert( !readerAtEnd(pReader) ); pReader->p += getVarint(pReader->p, &ret); pReader->iLastPos = 0; return ret;}/* Read the next position from a position list. * Returns the position, or -1 at the end of the list. */static int readPosition(DocListReader *pReader){ int i; int iType = pReader->pDoclist->iType; assert( iType>=DL_POSITIONS ); assert( !readerAtEnd(pReader) ); pReader->p += getVarint32(pReader->p, &i); if( i==0 ){ pReader->iLastPos = -1; return -1; } pReader->iLastPos += ((int) i)-1; if( iType>=DL_POSITIONS_OFFSETS ){ /* Skip over offsets, ignoring them for now. */ int iStart, iEnd; pReader->p += getVarint32(pReader->p, &iStart); pReader->p += getVarint32(pReader->p, &iEnd); } return pReader->iLastPos;}/* Skip past the end of a position list. */static void skipPositionList(DocListReader *pReader){ while( readPosition(pReader)!=-1 ) ;}/* Skip over a docid, including its position list if the doclist has * positions. */static void skipDocument(DocListReader *pReader){ readDocid(pReader); if( pReader->pDoclist->iType >= DL_POSITIONS ){ skipPositionList(pReader); }}static sqlite_int64 firstDocid(DocList *d){ DocListReader r; readerInit(&r, d); return readDocid(&r);}/* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; * otherwise pUpdate, which must contain only the single docid [iDocid], is * inserted (if not present) or updated (if already present). */static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ int modified = 0; DocListReader reader; char *p; if( pUpdate!=NULL ){ assert( d->iType==pUpdate->iType); assert( iDocid==firstDocid(pUpdate) ); } readerInit(&reader, d); while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){ skipDocument(&reader); } p = reader.p; /* Delete if there is a matching element. */ if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){ skipDocument(&reader); memmove(p, reader.p, docListEnd(d) - reader.p); d->nData -= (reader.p - p); modified = 1; } /* Insert if indicated. */ if( pUpdate!=NULL ){ int iDoclist = p-d->pData; docListAddEndPos(pUpdate); d->pData = realloc(d->pData, d->nData+pUpdate->nData); p = d->pData + iDoclist; memmove(p+pUpdate->nData, p, docListEnd(d) - p); memcpy(p, pUpdate->pData, pUpdate->nData); d->nData += pUpdate->nData; modified = 1; } return modified;}/* Split the second half of doclist d into a separate doclist d2. Returns 1 * if successful, or 0 if d contains a single document and hence can't be * split. */static int docListSplit(DocList *d, DocList *d2){ const char *pSplitPoint = d->pData + d->nData / 2; DocListReader reader; readerInit(&reader, d); while( reader.p<pSplitPoint ){ skipDocument(&reader); } if( readerAtEnd(&reader) ) return 0; docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p); d->nData = reader.p - d->pData; d->pData = realloc(d->pData, d->nData); return 1;}/* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked * on-disk doclist, resulting in another in-memory DocList [out]. [in] * and [out] may or may not store position information according to the * caller's wishes. The on-disk doclist always comes with positions. * * The caller must read each chunk of the on-disk doclist in succession and * pass it to mergeBlock(). * * If [in] has positions, then the merge output contains only documents with * matching positions in the two input doclists. If [in] does not have * positions, then the merge output contains all documents common to the two * input doclists. * * If [in] is NULL, then the on-disk doclist is copied to [out] directly. * * A merge is performed using an integer [iOffset] provided by the caller. * [iOffset] is subtracted from each position in the on-disk doclist for the * purpose of position comparison; this is helpful in implementing phrase * searches. * * A DocListMerge is not yet able to propagate offsets through query * processing; we should add that capability soon.*/typedef struct DocListMerge { DocListReader in; DocList *pOut; int iOffset;} DocListMerge;static void mergeInit(DocListMerge *m, DocList *pIn, int iOffset, DocList *pOut){ readerInit(&m->in, pIn); m->pOut = pOut; m->iOffset = iOffset; /* can't handle offsets yet */ assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); assert( pOut->iType <= DL_POSITIONS );}/* A helper function for mergeBlock(), below. Merge the position lists * pointed to by m->in and pBlockReader. * If the merge matches, write [iDocid] to m->pOut; if m->pOut * has positions then write all matching positions as well. */static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, DocListReader *pBlockReader){ int block_pos = readPosition(pBlockReader); int in_pos = readPosition(&m->in); int match = 0; while( block_pos!=-1 || in_pos!=-1 ){ if( block_pos-m->iOffset==in_pos ){ if( !match ){ docListAddDocid(m->pOut, iDocid); match = 1; } if( m->pOut->iType >= DL_POSITIONS ){ docListAddPos(m->pOut, in_pos); } block_pos = readPosition(pBlockReader); in_pos = readPosition(&m->in); } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){ block_pos = readPosition(pBlockReader); } else { in_pos = readPosition(&m->in); } } if( m->pOut->iType >= DL_POSITIONS && match ){ docListAddEndPos(m->pOut); }}/* Merge one block of an on-disk doclist into a DocListMerge. */static void mergeBlock(DocListMerge *m, DocList *pBlock){ DocListReader blockReader; assert( pBlock->iType >= DL_POSITIONS ); readerInit(&blockReader, pBlock); while( !readerAtEnd(&blockReader) ){ sqlite_int64 iDocid = readDocid(&blockReader); if( m->in.pDoclist!=NULL ){ while( 1 ){ if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ if( peekDocid(&m->in)>=iDocid ) break; skipDocument(&m->in); } if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ skipPositionList(&blockReader); /* skip this docid in the block */ continue; } readDocid(&m->in); } /* We have a document match. */ if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ /* We don't need to do a poslist merge. */ docListAddDocid(m->pOut, iDocid); if( m->pOut->iType >= DL_POSITIONS ){ /* Copy all positions to the output doclist. */ while( 1 ){ int pos = readPosition(&blockReader); if( pos==-1 ) break; docListAddPos(m->pOut, pos); } docListAddEndPos(m->pOut); } else skipPositionList(&blockReader); continue; } mergePosList(m, iDocid, &blockReader); }}static char *string_dup_n(const char *s, int n){ char *str = malloc(n + 1); memcpy(str, s, n); str[n] = '\0'; return str;}/* Duplicate a string; the caller must free() the returned string. * (We don't use strdup() since it's not part of the standard C library and * may not be available everywhere.) */static char *string_dup(const char *s){ return string_dup_n(s, strlen(s));}/* Format a string, replacing each occurrence of the % character with * zName. This may be more convenient than sqlite_mprintf() * when one string is used repeatedly in a format string. * The caller must free() the returned string. */static char *string_format(const char *zFormat, const char *zName){ const char *p; size_t len = 0; size_t nName = strlen(zName); char *result; char *r; /* first compute length needed */ for(p = zFormat ; *p ; ++p){ len += (*p=='%' ? nName : 1); } len += 1; /* for null terminator */ r = result = malloc(len); for(p = zFormat; *p; ++p){ if( *p=='%' ){ memcpy(r, zName, nName); r += nName; } else { *r++ = *p; } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -