📄 fts2.c
字号:
static void dlwCopy(DLWriter *pWriter, DLReader *pReader){ dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader), dlrDocid(pReader), dlrDocid(pReader));}static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){ char c[VARINT_MAX]; int n = putVarint(c, iDocid-pWriter->iPrevDocid); /* Docids must ascend. */ assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid ); assert( pWriter->iType==DL_DOCIDS ); dataBufferAppend(pWriter->b, c, n); pWriter->iPrevDocid = iDocid;#ifndef NDEBUG pWriter->has_iPrevDocid = 1;#endif}/*******************************************************************//* PLReader is used to read data from a document's position list. As** the caller steps through the list, data is cached so that varints** only need to be decoded once.**** plrInit, plrDestroy - create/destroy a reader.** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors** plrAtEnd - at end of stream, only call plrDestroy once true.** plrStep - step to the next element.*/typedef struct PLReader { /* These refer to the next position's data. nData will reach 0 when ** reading the last position, so plrStep() signals EOF by setting ** pData to NULL. */ const char *pData; int nData; DocListType iType; int iColumn; /* the last column read */ int iPosition; /* the last position read */ int iStartOffset; /* the last start offset read */ int iEndOffset; /* the last end offset read */} PLReader;static int plrAtEnd(PLReader *pReader){ return pReader->pData==NULL;}static int plrColumn(PLReader *pReader){ assert( !plrAtEnd(pReader) ); return pReader->iColumn;}static int plrPosition(PLReader *pReader){ assert( !plrAtEnd(pReader) ); return pReader->iPosition;}static int plrStartOffset(PLReader *pReader){ assert( !plrAtEnd(pReader) ); return pReader->iStartOffset;}static int plrEndOffset(PLReader *pReader){ assert( !plrAtEnd(pReader) ); return pReader->iEndOffset;}static void plrStep(PLReader *pReader){ int i, n; assert( !plrAtEnd(pReader) ); if( pReader->nData==0 ){ pReader->pData = NULL; return; } n = getVarint32(pReader->pData, &i); if( i==POS_COLUMN ){ n += getVarint32(pReader->pData+n, &pReader->iColumn); pReader->iPosition = 0; pReader->iStartOffset = 0; n += getVarint32(pReader->pData+n, &i); } /* Should never see adjacent column changes. */ assert( i!=POS_COLUMN ); if( i==POS_END ){ pReader->nData = 0; pReader->pData = NULL; return; } pReader->iPosition += i-POS_BASE; if( pReader->iType==DL_POSITIONS_OFFSETS ){ n += getVarint32(pReader->pData+n, &i); pReader->iStartOffset += i; n += getVarint32(pReader->pData+n, &i); pReader->iEndOffset = pReader->iStartOffset+i; } assert( n<=pReader->nData ); pReader->pData += n; pReader->nData -= n;}static void plrInit(PLReader *pReader, DLReader *pDLReader){ pReader->pData = dlrPosData(pDLReader); pReader->nData = dlrPosDataLen(pDLReader); pReader->iType = pDLReader->iType; pReader->iColumn = 0; pReader->iPosition = 0; pReader->iStartOffset = 0; pReader->iEndOffset = 0; plrStep(pReader);}static void plrDestroy(PLReader *pReader){ SCRAMBLE(pReader);}/*******************************************************************//* PLWriter is used in constructing a document's position list. As a** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.** PLWriter writes to the associated DLWriter's buffer.**** plwInit - init for writing a document's poslist.** plwDestroy - clear a writer.** plwAdd - append position and offset information.** plwCopy - copy next position's data from reader to writer.** plwTerminate - add any necessary doclist terminator.**** Calling plwAdd() after plwTerminate() may result in a corrupt** doclist.*//* TODO(shess) Until we've written the second item, we can cache the** first item's information. Then we'd have three states:**** - initialized with docid, no positions.** - docid and one position.** - docid and multiple positions.**** Only the last state needs to actually write to dlw->b, which would** be an improvement in the DLCollector case.*/typedef struct PLWriter { DLWriter *dlw; int iColumn; /* the last column written */ int iPos; /* the last position written */ int iOffset; /* the last start offset written */} PLWriter;/* TODO(shess) In the case where the parent is reading these values** from a PLReader, we could optimize to a copy if that PLReader has** the same type as pWriter.*/static void plwAdd(PLWriter *pWriter, int iColumn, int iPos, int iStartOffset, int iEndOffset){ /* Worst-case space for POS_COLUMN, iColumn, iPosDelta, ** iStartOffsetDelta, and iEndOffsetDelta. */ char c[5*VARINT_MAX]; int n = 0; /* Ban plwAdd() after plwTerminate(). */ assert( pWriter->iPos!=-1 ); if( pWriter->dlw->iType==DL_DOCIDS ) return; if( iColumn!=pWriter->iColumn ){ n += putVarint(c+n, POS_COLUMN); n += putVarint(c+n, iColumn); pWriter->iColumn = iColumn; pWriter->iPos = 0; pWriter->iOffset = 0; } assert( iPos>=pWriter->iPos ); n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos)); pWriter->iPos = iPos; if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){ assert( iStartOffset>=pWriter->iOffset ); n += putVarint(c+n, iStartOffset-pWriter->iOffset); pWriter->iOffset = iStartOffset; assert( iEndOffset>=iStartOffset ); n += putVarint(c+n, iEndOffset-iStartOffset); } dataBufferAppend(pWriter->dlw->b, c, n);}static void plwCopy(PLWriter *pWriter, PLReader *pReader){ plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader), plrStartOffset(pReader), plrEndOffset(pReader));}static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){ char c[VARINT_MAX]; int n; pWriter->dlw = dlw; /* Docids must ascend. */ assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid ); n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid); dataBufferAppend(pWriter->dlw->b, c, n); pWriter->dlw->iPrevDocid = iDocid;#ifndef NDEBUG pWriter->dlw->has_iPrevDocid = 1;#endif pWriter->iColumn = 0; pWriter->iPos = 0; pWriter->iOffset = 0;}/* TODO(shess) Should plwDestroy() also terminate the doclist? But** then plwDestroy() would no longer be just a destructor, it would** also be doing work, which isn't consistent with the overall idiom.** Another option would be for plwAdd() to always append any necessary** terminator, so that the output is always correct. But that would** add incremental work to the common case with the only benefit being** API elegance. Punt for now.*/static void plwTerminate(PLWriter *pWriter){ if( pWriter->dlw->iType>DL_DOCIDS ){ char c[VARINT_MAX]; int n = putVarint(c, POS_END); dataBufferAppend(pWriter->dlw->b, c, n); }#ifndef NDEBUG /* Mark as terminated for assert in plwAdd(). */ pWriter->iPos = -1;#endif}static void plwDestroy(PLWriter *pWriter){ SCRAMBLE(pWriter);}/*******************************************************************//* DLCollector wraps PLWriter and DLWriter to provide a** dynamically-allocated doclist area to use during tokenization.**** dlcNew - malloc up and initialize a collector.** dlcDelete - destroy a collector and all contained items.** dlcAddPos - append position and offset information.** dlcAddDoclist - add the collected doclist to the given buffer.** dlcNext - terminate the current document and open another.*/typedef struct DLCollector { DataBuffer b; DLWriter dlw; PLWriter plw;} DLCollector;/* TODO(shess) This could also be done by calling plwTerminate() and** dataBufferAppend(). I tried that, expecting nominal performance** differences, but it seemed to pretty reliably be worth 1% to code** it this way. I suspect it's the incremental malloc overhead (some** percentage of the plwTerminate() calls will cause a realloc), so** this might be worth revisiting if the DataBuffer implementation** changes.*/static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){ if( pCollector->dlw.iType>DL_DOCIDS ){ char c[VARINT_MAX]; int n = putVarint(c, POS_END); dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n); }else{ dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData); }}static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){ plwTerminate(&pCollector->plw); plwDestroy(&pCollector->plw); plwInit(&pCollector->plw, &pCollector->dlw, iDocid);}static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos, int iStartOffset, int iEndOffset){ plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);}static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){ DLCollector *pCollector = malloc(sizeof(DLCollector)); dataBufferInit(&pCollector->b, 0); dlwInit(&pCollector->dlw, iType, &pCollector->b); plwInit(&pCollector->plw, &pCollector->dlw, iDocid); return pCollector;}static void dlcDelete(DLCollector *pCollector){ plwDestroy(&pCollector->plw); dlwDestroy(&pCollector->dlw); dataBufferDestroy(&pCollector->b); SCRAMBLE(pCollector); free(pCollector);}/* Copy the doclist data of iType in pData/nData into *out, trimming** unnecessary data as we go. Only columns matching iColumn are** copied, all columns copied if iColumn is -1. Elements with no** matching columns are dropped. The output is an iOutType doclist.*//* NOTE(shess) This code is only valid after all doclists are merged.** If this is run before merges, then doclist items which represent** deletion will be trimmed, and will thus not effect a deletion** during the merge.*/static void docListTrim(DocListType iType, const char *pData, int nData, int iColumn, DocListType iOutType, DataBuffer *out){ DLReader dlReader; DLWriter dlWriter; assert( iOutType<=iType ); dlrInit(&dlReader, iType, pData, nData); dlwInit(&dlWriter, iOutType, out); while( !dlrAtEnd(&dlReader) ){ PLReader plReader; PLWriter plWriter; int match = 0; plrInit(&plReader, &dlReader); while( !plrAtEnd(&plReader) ){ if( iColumn==-1 || plrColumn(&plReader)==iColumn ){ if( !match ){ plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader)); match = 1; } plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader), plrStartOffset(&plReader), plrEndOffset(&plReader)); } plrStep(&plReader); } if( match ){ plwTerminate(&plWriter); plwDestroy(&plWriter); } plrDestroy(&plReader); dlrStep(&dlReader); } dlwDestroy(&dlWriter); dlrDestroy(&dlReader);}/* Used by docListMerge() to keep doclists in the ascending order by** docid, then ascending order by age (so the newest comes first).*/typedef struct OrderedDLReader { DLReader *pReader; /* TODO(shess) If we assume that docListMerge pReaders is ordered by ** age (which we do), then we could use pReader comparisons to break ** ties. */ int idx;} OrderedDLReader;/* Order eof to end, then by docid asc, idx desc. */static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){ if( dlrAtEnd(r1->pReader) ){ if( dlrAtEnd(r2->pReader) ) return 0; /* Both atEnd(). */ return 1; /* Only r1 atEnd(). */ } if( dlrAtEnd(r2->pReader) ) return -1; /* Only r2 atEnd(). */ if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1; if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1; /* Descending on idx. */ return r2->idx-r1->idx;}/* Bubble p[0] to appropriate place in p[1..n-1]. Assumes that** p[1..n-1] is already sorted.*//* TODO(shess) Is this frequent enough to warrant a binary search?** Before implementing that, instrument the code to check. In most** current usage, I expect that p[0] will be less than p[1] a very** high proportion of the time.*/static void orderedDLReaderReorder(OrderedDLReader *p, int n){ while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){ OrderedDLReader tmp = p[0]; p[0] = p[1]; p[1] = tmp; n--; p++; }}/* Given an array of doclist readers, merge their doclist elements** into out in sorted order (by docid), dropping elements from older** readers when there is a duplicate docid. pReaders is assumed to be** ordered by age, oldest first.*//* TODO(shess) nReaders must be <= MERGE_COUNT. This should probably** be fixed.*/static void docListMerge(DataBuffer *out, DLReader *pReaders, int nReaders){ OrderedDLReader readers[MERGE_COUNT];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -