⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segmenttermenum.cpp

📁 clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.
💻 CPP
字号:
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "SegmentTermEnum.h"

#include "Terms.h"
#include "FieldInfos.h"
#include "Term.h"
#include "TermInfo.h"
#include "TermInfosWriter.h"

CL_NS_USE(store)
CL_NS_DEF(index)

	SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const bool isi):
		fieldInfos(fis){
	//Func - Constructor
	//Pre  - i holds a reference to an instance of IndexInput
	//       fis holds a reference to an instance of FieldInfos
	//       isi
	//Post - An instance of SegmentTermEnum has been created
		input		 = i;
		position     = -1;
		//Instantiate a Term with empty field, empty text and which is interned (see term.h what interned means)
	    _term         = _CLNEW Term( LUCENE_BLANK_STRING,LUCENE_BLANK_STRING );
		isIndex      = isi;
		termInfo     = _CLNEW TermInfo();
		indexPointer = 0;
		buffer       = NULL;
		bufferLength = 0;
		prev         = NULL;
		formatM1SkipInterval = 0;

		//Set isClone to false as the instance is not clone of another instance
		isClone      = false;


		int32_t firstInt = input->readInt();
      if (firstInt >= 0) {
         // original-format file, without explicit format version number
         format = 0;
         size = firstInt;

         // back-compatible settings
         indexInterval = 128;
         skipInterval = LUCENE_INT32_MAX_SHOULDBE; // switch off skipTo optimization

      } else {
         // we have a format version number
         format = firstInt;

         // check that it is a format we can understand
         if (format < TermInfosWriter::FORMAT){
            TCHAR err[30];
            _sntprintf(err,30,_T("Unknown format version: %d"), format);
            _CLTHROWT(CL_ERR_Runtime,err);
         }

         size = input->readLong();                    // read the size
         
         if(format == -1){
            if (!isIndex) {
               indexInterval = input->readInt();
               formatM1SkipInterval = input->readInt();
            }
            // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in 
            // skipTo implementation of these versions
            skipInterval = LUCENE_INT32_MAX_SHOULDBE;
         }else{
            indexInterval = input->readInt();
            skipInterval = input->readInt();
         }
      }
	}

	SegmentTermEnum::SegmentTermEnum(const SegmentTermEnum& clone):
		fieldInfos(clone.fieldInfos)
	{
	//Func - Constructor
	//       The instance is created by cloning all properties of clone
	//Pre  - clone holds a valid reference to SegmentTermEnum
	//Post - An instance of SegmentTermEnum with the same properties as clone
		
		input		 = clone.input->clone();
		//Copy the postion from the clone
		position     = clone.position;

		_term         = clone._term==NULL?NULL:_CLNEW Term(clone._term->field(),clone._term->text());
		isIndex      = clone.isIndex;
		termInfo     = _CLNEW TermInfo(clone.termInfo);
		indexPointer = clone.indexPointer;
		buffer       = clone.buffer==NULL?NULL:(TCHAR*)malloc(sizeof(TCHAR) * (clone.bufferLength+1));
		bufferLength = clone.bufferLength;
		prev         = clone.prev==NULL?NULL:_CLNEW Term(clone.prev->field(),clone.prev->text());
		size         = clone.size;

      format       = clone.format;
      indexInterval= clone.indexInterval;
      skipInterval = clone.skipInterval;
      formatM1SkipInterval = clone.formatM1SkipInterval;
		//Set isClone to true as this instance is a clone of another instance
		isClone      = true;

		//Copy the contents of buffer of clone to the buffer of this instance
		if ( clone.buffer != NULL )
			memcpy(buffer,clone.buffer,bufferLength * sizeof(TCHAR));
	}

	SegmentTermEnum::~SegmentTermEnum(){
	//Func - Destructor
	//Pre  - true
	//Post - The instance has been destroyed. If this instance was a clone
	//       then the inputstream is closed and deleted too.

        //todo: revisit this... close() should clean up most of everything.

		//Finalize prev
		_CLDECDELETE(prev );
		//Finalize term
		_CLDECDELETE( _term );
		

		//Delete the buffer if necessary
		free(buffer);
		//Delete termInfo if necessary
		_CLDELETE(termInfo);

		//Check if this instance is a clone
		if ( isClone ){
			//Close the inputstream
			input->close();
			//delete the inputstream
			_CLDELETE(input);
			}
	}

	bool SegmentTermEnum::next(){
	//Func - Moves the current of the set to the next in the set
	//Pre  - true
	//Post - If the end has been reached NULL is returned otherwise the term has
	//       become the next Term in the enumeration

		//Increase position by and and check if the end has been reached
		if (position++ >= size-1) {
			//delete term
			_CLDECDELETE(_term);
			return false;
		}

		//delete the previous enumerated term
		Term* tmp=NULL;
		if ( prev != NULL ){
			int32_t usage = prev->__cl_refcount;
			if ( usage > 1 ){
				_CLDECDELETE(prev); //todo: tune other places try and delete its term 
			}else
				tmp = prev; //we are going to re-use this term
		}
		//prev becomes the current enumerated term
		prev = _term;
		//term becomes the next term read from inputStream input
		_term = readTerm(tmp);

		//Read docFreq, the number of documents which contain the term.
		termInfo->docFreq = input->readVInt();
		//Read freqPointer, a pointer into the TermFreqs file (.frq)
		termInfo->freqPointer += input->readVLong();
		
		//Read proxPointer, a pointer into the TermPosition file (.prx).
		termInfo->proxPointer += input->readVLong();

      if(format == -1){
         //  just read skipOffset in order to increment  file pointer; 
         // value is never used since skipTo is switched off
         if (!isIndex) {
            if (termInfo->docFreq > formatM1SkipInterval) {
               termInfo->skipOffset = input->readVInt(); 
            }
         }
      }else{
         if (termInfo->docFreq >= skipInterval) 
            termInfo->skipOffset = input->readVInt();
      }

		//Check if the enumeration is an index
		if (isIndex)
			//read index pointer
			indexPointer += input->readVLong();

		return true;
	}

	Term* SegmentTermEnum::term() {
	//Func - Returns the current term.
	//Pre  - pointer is true or false and indicates if the reference counter
	//       of term must be increased or not
	//       next() must have been called once!
	//Post - pointer = true -> term has been returned with an increased reference counter
	//       pointer = false -> term has been returned

		return _CL_POINTER(_term);
	}
	Term* SegmentTermEnum::term(bool pointer) {
		if ( pointer )
			return _CL_POINTER(_term);
		else
			return _term;
	}

	void SegmentTermEnum::scanTo(const Term *term){
	//Func - Scan for Term without allocating new Terms
	//Pre  - term != NULL
	//Post - The iterator term has been moved to the position where Term is expected to be
	//       in the enumeration
		while ( term->compareTo(this->_term) > 0 && next()) 
		{
		}
	}

	void SegmentTermEnum::close() {
	//Func - Closes the enumeration to further activity, freeing resources.
	//Pre  - true
	//Post - The inputStream input has been closed

			input->close();
	}

	int32_t SegmentTermEnum::docFreq() const {
	//Func - Returns the document frequency of the current term in the set
	//Pre  - termInfo != NULL
	//       next() must have been called once
	//Post  - The document frequency of the current enumerated term has been returned

		return termInfo->docFreq;
	}

	void SegmentTermEnum::seek(const int64_t pointer, const int32_t p, Term* t, TermInfo* ti) {
	//Func - Repositions term and termInfo within the enumeration
	//Pre  - pointer >= 0
	//       p >= 0 and contains the new position within the enumeration
	//       t is a valid reference to a Term and is the new current term in the enumeration
	//       ti is a valid reference to a TermInfo and is corresponding TermInfo form the new
	//       current Term
	//Post - term and terminfo have been repositioned within the enumeration

		//Reset the IndexInput input to pointer
		input->seek(pointer);
		//Assign the new position
		position = p;

		//finalize the current term
		if ( _term == NULL || _term->__cl_refcount > 1 ){
			_CLDECDELETE(_term);
			//Get a pointer from t and increase the reference counter of t
			_term = _CLNEW Term(t->field(),t->text()); //cannot use reference, because TermInfosReader uses non ref-counted array
		}else{
			_term->set(t->field(),t->text());
		}
		//finalize prev
		_CLDECDELETE(prev);

		//Change the current termInfo so it matches the new current term
		termInfo->set(ti);

		//Have the buffer grown if needed
		if ( bufferLength <= _term->textLength() )
			growBuffer(_term->textLength() );		  // copy term text into buffer
		else
			_tcsncpy(buffer,_term->text(),bufferLength); //just copy the buffer
	}

	TermInfo* SegmentTermEnum::getTermInfo()const {
	//Func - Returns a clone of the current termInfo
	//Pre  - termInfo != NULL
	//       next() must have been called once
	//Post - A clone of the current termInfo has been returned

		return _CLNEW TermInfo(*termInfo); //clone
	}

	void SegmentTermEnum::getTermInfo(TermInfo* ti)const {
	//Func - Retrieves a clone of termInfo through the reference ti
	//Pre  - ti contains a valid reference to TermInfo
	//       termInfo != NULL
	//       next() must have been called once
	//Post - ti contains a clone of termInfo

		ti->set(termInfo);
	}

	int64_t SegmentTermEnum::freqPointer()const {
	//Func - Returns the freqpointer of the current termInfo
	//Pre  - termInfo != NULL
	//       next() must have been called once
	//Post - The freqpointer of the current termInfo has been returned

		return termInfo->freqPointer;
	}

	int64_t SegmentTermEnum::proxPointer()const {
	//Func - Returns the proxPointer of the current termInfo
	//Pre  - termInfo != NULL
	//       next() must have been called once
	//Post - the proxPointer of the current termInfo has been returned

		return termInfo->proxPointer;
	}

	SegmentTermEnum* SegmentTermEnum::clone() const {
	//Func - Returns a clone of this instance
	//Pre  - true
	//Post - An clone of this instance has been returned

		return _CLNEW SegmentTermEnum(*this);
	}

	Term* SegmentTermEnum::readTerm(Term* reuse) {
	//Func - Reads the next term in the enumeration
	//Pre  - true
	//Post - The next Term in the enumeration has been read and returned

		//Read the start position from the inputStream input
		int32_t start = input->readVInt();
		//Read the length of term in the inputStream input
		int32_t length = input->readVInt();

		//Calculated the total lenght of bytes that buffer must be to contain the current
		//chars in buffer and the new ones yet to be read
		uint32_t totalLength = start + length;

		//TODO: check this, not copying buffer every time.
		if (static_cast<uint32_t>(bufferLength) < totalLength+1)
			growBuffer(totalLength);

		//Read a length number of characters into the buffer from position start in the inputStream input
		input->readChars(buffer, start, length);
		//Null terminate the string
		buffer[totalLength] = 0;

		//Return a new Term	
		int32_t field = input->readVInt();
		const TCHAR* fieldname = fieldInfos->fieldName(field);
		if ( reuse != NULL ){
			reuse->set(fieldname, buffer);
			return reuse;
		}else
			return _CLNEW Term( fieldname, buffer );
	}

	void SegmentTermEnum::growBuffer(const uint32_t length) {
	//Func - Instantiate a buffer of length length+1
	//Pre  - length > 0
	//Post - pre(buffer) has been deleted with its contents. A new buffer
	//       has been allocated of length length+1 and the text of term has been copied
	//       to buffer
		//todo: we could guess that we will need to re-grow this
		//buffer a few times...so start off with a reasonable grow
		//value...
		if ( bufferLength > length )
			return;

        //Store the new bufferLength
		if ( length - bufferLength < LUCENE_SEGMENTTERMENUM_GROWSIZE )
			bufferLength = length+LUCENE_SEGMENTTERMENUM_GROWSIZE;
		else
			bufferLength = length+1;

		bool copy = buffer==NULL;

		//Instantiate the new buffer + 1 is needed for terminator '\0'
		if ( buffer == NULL )
			buffer = (TCHAR*)malloc(sizeof(TCHAR) * (bufferLength+1));
		else
			buffer = (TCHAR*)realloc(buffer, sizeof(TCHAR) * (bufferLength+1));

		if ( copy ){
			//Copy the text of term into buffer
			_tcsncpy(buffer,_term->text(),bufferLength);
		}
	}

CL_NS_END

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -