⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 termvectorwriter.cpp

📁 clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.
💻 CPP
字号:
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "TermVector.h"
#include "CLucene/util/Misc.h"

CL_NS_USE(util)
CL_NS_DEF(index)

 /**
  * Creates the three term-vector output files of a segment (one per
  * LUCENE_TVX/TVD/TVF extension) and writes FORMAT_VERSION at the start of
  * each of them.
  *
  * @param directory  directory in which the output files are created
  * @param segment    segment name used as the base of every file name; it is
  *                   copied, followed by an extension, into a buffer of
  *                   CL_MAX_NAME chars without a length check
  * @param fieldInfos kept for later field-name to field-number lookups
  */
 TermVectorsWriter::TermVectorsWriter(CL_NS(store)::Directory* directory, 
     const char* segment,FieldInfos* fieldInfos)
 {
	// The segment name is copied once; only the extension that follows it
	// is overwritten for each of the three files.
	char fileName[CL_MAX_NAME];
	strcpy(fileName, segment);
	char* extensionStart = fileName + strlen(fileName);

	// index file
	strcpy(extensionStart, LUCENE_TVX_EXTENSION);
	tvx = directory->createOutput(fileName);
	tvx->writeInt(FORMAT_VERSION);

	// document file
	strcpy(extensionStart, LUCENE_TVD_EXTENSION);
	tvd = directory->createOutput(fileName);
	tvd->writeInt(FORMAT_VERSION);

	// field file
	strcpy(extensionStart, LUCENE_TVF_EXTENSION);
	tvf = directory->createOutput(fileName);
	tvf->writeInt(FORMAT_VERSION);

	this->fieldInfos = fieldInfos;

	// a fresh writer has neither a field nor a document open
	currentField = NULL;
	currentDocPointer = -1;
 }

  /**
   * Closes each output stream that is still non-NULL and hands it to
   * _CLDELETE.
   */
  TermVectorsWriter::~TermVectorsWriter(){
      // index stream
      if ( NULL != tvx ){
          tvx->close();
          _CLDELETE(tvx);
      }

      // document stream
      if ( NULL != tvd ){
          tvd->close();
          _CLDELETE(tvd);
      }

      // field stream
      if ( NULL != tvf ){
          tvf->close();
          _CLDELETE(tvf);
      }
  }


  /**
   * Begins a new document. A document that is still open is finished first
   * through closeDocument(); afterwards the current position of the tvd
   * stream is recorded as this document's pointer.
   */
  void TermVectorsWriter::openDocument() {
    // finish whatever was in progress
    closeDocument();

    // from here on isDocumentOpen() holds, as long as the position is not -1
    currentDocPointer = tvd->getFilePointer();
  }


  /**
   * Finishes the open document, if any: closes the open field, writes the
   * document record, empties the list of collected fields and marks the
   * writer as having no open document. Does nothing when no document is open.
   */
  void TermVectorsWriter::closeDocument(){
    if (!isDocumentOpen())
      return;

    closeField();
    writeDoc();
    fields.clear();
    currentDocPointer = -1;
  }


  /**
   * @return true while a document pointer is recorded; -1 is the marker used
   *         for "no open document"
   */
  bool TermVectorsWriter::isDocumentOpen() const{
    const bool noDocument = (currentDocPointer == -1);
    return !noDocument;
  }


  /**
   * Starts collecting terms for the named field. A field that is still open
   * is closed first.
   *
   * @param field field name; it is translated to a field number through
   *              fieldInfos
   * @throws CL_ERR_InvalidState if no document is open
   */
  void TermVectorsWriter::openField(const TCHAR* field) {
    if (!isDocumentOpen()) {
      _CLTHROWA(CL_ERR_InvalidState,"Cannot open field when no document is open.");
    }

    // flush the previous field before a new one replaces it
    closeField();

    currentField = _CLNEW TVField(fieldInfos->fieldNumber(field));
  }

  /**
   * Finishes the open field, if any: writes its terms, appends the field to
   * the list kept for the current document, empties the term list and marks
   * the writer as having no open field. Does nothing when no field is open.
   */
  void TermVectorsWriter::closeField(){
    if (!isFieldOpen())
      return;

    // save field and terms
    writeField();
    fields.push_back(currentField);

    terms.clear();
    currentField = NULL;
  }

  /**
   * @return true while currentField points at a field
   */
  bool TermVectorsWriter::isFieldOpen() const{
    return NULL != currentField;
  }

  /**
   * Adds one term to the open field after checking the writer state.
   *
   * @param termText text of the term
   * @param freq     frequency recorded for the term
   * @throws CL_ERR_InvalidState if no document or no field is open
   */
  void TermVectorsWriter::addTerm(const TCHAR* termText, int32_t freq) {
    if (!isDocumentOpen()) {
      _CLTHROWA(CL_ERR_InvalidState,"Cannot add terms when document is not open");
    }
    if (!isFieldOpen()) {
      _CLTHROWA(CL_ERR_InvalidState,"Cannot add terms when field is not open");
    }

    addTermInternal(termText, freq);
  }

  void TermVectorsWriter::addTermInternal(const TCHAR* termText, int32_t freq) {
    currentField->length += freq;
    TVTerm* term = _CLNEW TVTerm();
    term->setTermText(termText);
    term->freq = freq;
    terms.push_back(term);
  }


  /**
   * Adds every term vector of a NULL-terminated array to the open document,
   * each one as a field of its own.
   *
   * @param vectors array of term vectors whose last element is NULL; the
   *                pointer itself must not be NULL
   * @throws CL_ERR_InvalidState if no document is open or a field is still
   *         open
   */
  void TermVectorsWriter::addVectors(TermFreqVector** vectors) {
    if (!isDocumentOpen()) _CLTHROWA(CL_ERR_InvalidState,"Cannot add term vectors when document is not open");
    if (isFieldOpen()) _CLTHROWA(CL_ERR_InvalidState,"Cannot add term vectors when field is open");

    // Walk up to the NULL terminator. The index has to advance on every pass,
    // otherwise the first vector would be added over and over without end.
    for (int32_t i = 0; vectors[i] != NULL; ++i) {
      addTermFreqVector(vectors[i]);
    }
  }

  /**
   * Adds a single term vector to the open document as a field of its own,
   * after checking the writer state.
   *
   * @param v term vector to add
   * @throws CL_ERR_InvalidState if no document is open or a field is still
   *         open
   */
  void TermVectorsWriter::addTermFreqVector(TermFreqVector* v){
    if (!isDocumentOpen()) {
      _CLTHROWA(CL_ERR_InvalidState,"Cannot add term vector when document is not open");
    }
    if (isFieldOpen()) {
      _CLTHROWA(CL_ERR_InvalidState,"Cannot add term vector when field is open");
    }

    addTermFreqVectorInternal(v);
  }

  /**
   * Writes one term vector as a complete field: opens the field named by the
   * vector, adds every (term, frequency) pair and closes the field again.
   * The array returned by v->getTerms() is handed to _CLDELETE_ARRAY once the
   * terms have been added.
   *
   * @param v term vector to write
   */
  void TermVectorsWriter::addTermFreqVectorInternal(TermFreqVector* v){
    openField(v->getField());

    // named differently from the member term list, which addTermInternal() fills
    const TCHAR** termTexts = v->getTerms();
    const int32_t* termFreqs = v->getTermFrequencies();
    int32_t count = v->size();

    for (int32_t idx = 0; idx < count; ++idx)
      addTermInternal(termTexts[idx], termFreqs[idx]);

    _CLDELETE_ARRAY(termTexts);
    closeField();
  }

 
  
  
  /**
   * Finishes any open document, then closes and releases the tvx, tvd and tvf
   * streams.
   *
   * When closing a stream fails with a CLuceneError numbered CL_ERR_IO, or
   * with a non-CLucene exception, the remaining streams are still attempted;
   * the first such failure is remembered and thrown after all three have been
   * tried. A CLuceneError with any other number is re-thrown at once, so the
   * streams after it are not attempted here.
   *
   * NOTE(review): the cleanup code is passed to _CLFINALLY, which is defined
   * outside this file; presumably it runs the cleanup both when
   * closeDocument() throws and when it succeeds - confirm against the macro.
   */
  void TermVectorsWriter::close() {
    try {
      closeDocument();

      // make an effort to close all streams we can but remember and re-throw
      // the first exception encountered in this process.
      // The remembered message is copied with an explicit bound and terminator
      // so that a long e.what() cannot overrun the fixed-size keep buffer.
      // (No comments inside the macro itself: its lines are spliced together.)
	#define _DOTVWCLOSE(x) if (x != NULL){ \
		try { \
		  x->close(); _CLDELETE(x) \
        } catch (CLuceneError& e) { \
          if ( e.number() != CL_ERR_IO ) throw e; \
		  if (ikeep==0)ikeep=e.number(); \
          if (keep[0]==0){ strncpy(keep,e.what(),sizeof(keep)-1); keep[sizeof(keep)-1]=0; } \
		} catch (...) { \
			if (keep[0]==0) strcpy(keep,"Unknown error while closing " #x); \
		} \
	  }
	}_CLFINALLY( \
	  char keep[200]; \
	  int32_t ikeep=0; \
	  keep[0]=0; \
	  _DOTVWCLOSE(tvx); \
	  _DOTVWCLOSE(tvd); \
	  _DOTVWCLOSE(tvf); \
		if (keep[0] != 0 ) { \
			_CLTHROWA(ikeep,keep); \
		} \
	);
  }

  

  void TermVectorsWriter::writeField()  {
    // remember where this field is written
    currentField->tvfPointer = tvf->getFilePointer();
    //System.out.println("Field Pointer: " + currentField.tvfPointer);
    int32_t size;

    tvf->writeVInt(size = terms.size());
    tvf->writeVInt(currentField->length - size);
    const TCHAR* lastTermText = LUCENE_BLANK_STRING;
	int32_t lastTermTextLen = 0;

    // write term ids and positions
    for (int32_t i = 0; i < size; i++) {
      TVTerm* term = (TVTerm*)terms[i];
      //tvf->writeString(term->termText);
	  int32_t start = CL_NS(util)::Misc::stringDifference(lastTermText, lastTermTextLen, 
		  term->getTermText(),term->getTermTextLen());
      int32_t length = term->getTermTextLen() - start;
      tvf->writeVInt(start);			  // write shared prefix length
      tvf->writeVInt(length);			  // write delta length
      tvf->writeChars(term->getTermText(), start, length);  // write delta chars
      tvf->writeVInt(term->freq);

      lastTermText = term->getTermText();
	  lastTermTextLen = term->getTermTextLen();
    }
  }




  void TermVectorsWriter::writeDoc()  {
    if (isFieldOpen()) _CLTHROWA(CL_ERR_InvalidState,"Field is still open while writing document");
    //System.out.println("Writing doc pointer: " + currentDocPointer);
    // write document index record
    tvx->writeLong(currentDocPointer);

    // write document data record
    int32_t size = fields.size();

    // write the number of fields
    tvd->writeVInt(size);

    // write field numbers
	{ //msvc6 scope fix
		for (int32_t i = 0; i < size; i++) {
		  TVField* field = (TVField*) fields[i];
		  tvd->writeVInt(field->number);
		}
	}

    // write field pointers
    int64_t lastFieldPointer = 0;
    { //msvc6 scope fix
		for (int32_t i = 0; i < size; i++) {
		  TVField* field = (TVField*) fields[i];
		  tvd->writeVLong(field->tvfPointer - lastFieldPointer);

		  lastFieldPointer = field->tvfPointer;
		}
	}
    //System.out.println("After writing doc pointer: " + tvx.getFilePointer());
  }


  /**
   * @return the term text held by this term; NULL when none has been set
   */
  const TCHAR* TermVectorsWriter::TVTerm::getTermText() const{
      return this->termText;
  }
    /**
     * Returns the length of the term text. The length is measured on the first
     * call after the text was set and cached; -1 marks "not measured yet".
     */
    size_t TermVectorsWriter::TVTerm::getTermTextLen(){ 
		if (termTextLen != -1)
			return termTextLen;

		termTextLen = _tcslen(termText);
		return termTextLen;
	}
	void TermVectorsWriter::TVTerm::setTermText(const TCHAR* val){ 
        _CLDELETE_CARRAY(termText);
		termText = STRDUP_TtoT(val);
		termTextLen = -1;

	}
	/**
	 * Creates an empty term: frequency 0, no text, length not yet measured.
	 */
	TermVectorsWriter::TVTerm::TVTerm(): freq(0){ 
		termText = NULL;
		termTextLen = -1;   // -1: length not measured yet
	}
    /**
     * Hands the term text to _CLDELETE_CARRAY.
     */
    TermVectorsWriter::TVTerm::~TVTerm(){
        _CLDELETE_CARRAY(termText)
    }

CL_NS_END

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -