⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segmentmerger.cpp

📁 clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "SegmentMerger.h"

CL_NS_USE(util)
CL_NS_USE(document)
CL_NS_USE(store)
CL_NS_DEF(index)

   // File extensions of old-style index files.
   // All entries are packed into one string: each is a three-letter extension
   // followed by an explicit '\0', so entry i starts at byte offset i*4.
   const char* COMPOUND_EXTENSIONS="fnm\0" "frq\0" "prx\0" "fdx\0" "fdt\0" "tii\0" "tis\0";
   // Number of entries packed into COMPOUND_EXTENSIONS.
   int COMPOUND_EXTENSIONS_LENGTH=7;

   // File extensions of the vector files, packed the same way (4 bytes per entry).
   const char* VECTOR_EXTENSIONS="tvx\0" "tvd\0" "tvf\0";
   // Number of entries packed into VECTOR_EXTENSIONS.
   int VECTOR_EXTENSIONS_LENGTH=3;

  SegmentMerger::SegmentMerger(Directory* dir, const char* name, const bool compoundFile): directory(dir){
  //Func - Constructor
  //Pre  - dir refers to a valid Directory
  //       name is not NULL
  //Post - The instance stores dir, a duplicate of name and the compound file
  //       flag; no output or writer has been opened yet

      CND_PRECONDITION(name != NULL, "name is NULL");

      //Values taken from the arguments
      useCompoundFile  = compoundFile;
      segment          = STRDUP_AtoA(name);

      //Outputs, writers and merge state are created later on
      fieldInfos       = NULL;
      queue            = NULL;
      termInfosWriter  = NULL;
      proxOutput       = NULL;
      freqOutput       = NULL;

      //Skip bookkeeping starts out zeroed
      skipInterval        = 0;
      lastSkipProxPointer = 0;
      lastSkipFreqPointer = 0;
      lastSkipDoc         = 0;

      //Buffer for skip data, backed by a RAMIndexOutput
      skipBuffer       = _CLNEW CL_NS(store)::RAMIndexOutput();
  }

  SegmentMerger::~SegmentMerger(){
  //Func - Destructor
  //Pre  - true
  //Post - Every output, writer, queue and buffer still held by the instance
  //       has been closed and deleted, and the segment name has been freed

      //Empty the set of readers
      readers.clear();

      //Release the merged field infos
      _CLDELETE(fieldInfos);

      //IndexOutput of the frequency file
      if (freqOutput){
          freqOutput->close();
          _CLDELETE(freqOutput);
      }

      //IndexOutput of the prox file
      if (proxOutput){
          proxOutput->close();
          _CLDELETE(proxOutput);
      }

      //Writer of the term infos
      if (termInfosWriter){
          termInfosWriter->close();
          _CLDELETE(termInfosWriter);
      }

      //Merge queue
      if (queue){
          queue->close();
          _CLDELETE(queue);
      }

      //Skip buffer
      if (skipBuffer){
          skipBuffer->close();
          _CLDELETE(skipBuffer);
      }

      //Name of the segment
      _CLDELETE_CaARRAY(segment);
  }

  void SegmentMerger::add(SegmentReader* reader) {
  //Func - Registers a SegmentReader whose segment takes part in the merge
  //Pre  - reader refers to a valid SegmentReader
  //Post - reader has been appended at the end of the set of readers

      readers.push_back(reader);
  }

  IndexReader* SegmentMerger::segmentReader(const int32_t i) {
  //Func - Gives access to the reader stored at position i
  //Pre  - 0 <= i < readers.size()
  //Post - The reader at position i has been returned

      //The index has to lie inside the set of readers
      CND_PRECONDITION(i >= 0, "i is a negative number");
      CND_PRECONDITION(static_cast<size_t>(i) < readers.size(), "i is bigger than the number of SegmentReader instances");

      //Look up the requested reader and make sure the slot is filled
      SegmentReader* const found = readers[i];
      CND_CONDITION(found != NULL,"No SegmentReader found");

      return found;
  }

  int32_t SegmentMerger::merge() {
  //Func - Runs all merge steps: fields, terms, norms, then the vectors when
  //       the merged field infos report any, and finally the compound file
  //       when it was requested at construction
  //Post - The value returned by mergeFields (its document count) is returned

    const int32_t mergedDocs = mergeFields();
    mergeTerms();
    mergeNorms();

    //Vectors are only merged if the merged field infos have them
    if (fieldInfos->hasVectors()){
      mergeVectors();
    }

    //Optionally pack everything into one compound file
    if (useCompoundFile){
      createCompoundFile();
    }

    return mergedDocs;
  }

  void SegmentMerger::closeReaders(){
    for (uint32_t i = 0; i < readers.size(); i++) {  // close readers
      IndexReader* reader = readers[i];
      reader->close();
    }
  }

  void SegmentMerger::createCompoundFile(){
      char name[CL_MAX_PATH];
      _snprintf(name,CL_MAX_PATH,"%s.cfs",segment);
      CompoundFileWriter* cfsWriter = _CLNEW CompoundFileWriter(directory, name);


      char** files = _CL_NEWARRAY(char*, COMPOUND_EXTENSIONS_LENGTH  + VECTOR_EXTENSIONS_LENGTH + fieldInfos->size());
      int32_t fileslen = 0;

	  { //msvc6 scope fix
		  // Basic files
		  for (int32_t i = 0; i < COMPOUND_EXTENSIONS_LENGTH; i++) {
			 files[fileslen]=Misc::ajoin(segment,".",COMPOUND_EXTENSIONS+(i*4));
			 fileslen++;
		  }
	  }

	  { //msvc6 scope fix
		  // Field norm files
		  for (int32_t i = 0; i < fieldInfos->size(); i++) {
			 FieldInfo* fi = fieldInfos->fieldInfo(i);
			 if (fi->isIndexed) {
				TCHAR tbuf[10];
				char abuf[10];
				_i64tot(i,tbuf,10);
				STRCPY_TtoA(abuf,tbuf,10);

				files[fileslen] = Misc::ajoin(segment,".f",abuf);
				fileslen++;
			 }
		  }
	  }

      // Vector files
      if (fieldInfos->hasVectors()) {
         for (int32_t i = 0; i < VECTOR_EXTENSIONS_LENGTH; i++) {
            files[fileslen] = Misc::ajoin(segment, ".", VECTOR_EXTENSIONS+(i*4));
					  fileslen++;
         }
      }

	{ //msvc6 scope fix
		// Now merge all added files
		for ( int32_t i=0;i<fileslen;i++ ){
		  cfsWriter->addFile(files[i]);
		}
	}
    
    // Perform the merge
    cfsWriter->close();
	_CLDELETE(cfsWriter);
        
	{ //msvc6 scope fix
		// Now delete the source files
		for ( int32_t i=0;i<fileslen;i++ ){
		  directory->deleteFile(files[i]);
		  _CLDELETE_LCaARRAY(files[i]);
		}
	}

    _CLDELETE_ARRAY(files);
  }

  int32_t SegmentMerger::mergeFields() {
  //Func - Merges the field infos and the stored documents of all readers
  //Pre  - true
  //Post - A new FieldInfos with the field names of all readers has been
  //       written to the directory, all documents that are not marked as
  //       deleted have been handed to a FieldsWriter, and the number of
  //       documents handed over has been returned

      //Number of documents copied into the new segment
      int32_t docCount = 0;

      //Start with an empty FieldInfos for the merged field names
      fieldInfos = _CLNEW FieldInfos();
      CND_CONDITION(fieldInfos != NULL,"Memory allocation for fieldInfos failed");

      //First pass: gather the field names of every reader
      for (uint32_t r = 0; r < readers.size(); ++r){
          SegmentReader* current = readers[r];
          CND_CONDITION(current != NULL,"No SegmentReader found");

          //Each name list is added with its own pair of flags and is
          //released right after it has been added
          TCHAR** names = current->getIndexedFieldNames(true);
          fieldInfos->add((const TCHAR**)names, true, true);
          _CLDELETE_CARRAY_ALL(names);

          names = current->getIndexedFieldNames(false);
          fieldInfos->add((const TCHAR**)names, true, false);
          _CLDELETE_CARRAY_ALL(names);

          names = current->getFieldNames(false);
          fieldInfos->add((const TCHAR**)names, false, false);
          _CLDELETE_CARRAY_ALL(names);
      }

      //Write the merged field infos under the ".fnm" name of the segment
      const char* fnmName = Misc::segmentname(segment,".fnm");
      fieldInfos->write(directory, fnmName );
      _CLDELETE_CaARRAY(fnmName);

      //Second pass: copy the field values with a FieldsWriter that uses the
      //merged field infos
      FieldsWriter* fieldsWriter = _CLNEW FieldsWriter(directory, segment, fieldInfos);
      CND_CONDITION(fieldsWriter != NULL,"Memory allocation for fieldsWriter failed");

      try {
          for (uint32_t s = 0; s < readers.size(); ++s) {
              IndexReader* source = (SegmentReader*)readers[s];
              CND_CONDITION(source != NULL, "No SegmentReader found");

              //maxDoc also counts the documents that are marked as deleted
              const int32_t limit = source->maxDoc();

              for (int32_t docId = 0; docId < limit; ++docId){
                  //Deleted documents are not carried over
                  if (source->isDeleted(docId))
                      continue;

                  //Fetch the document, write it and release it again
                  Document* stored = source->document(docId);
                  fieldsWriter->addDocument( stored );
                  ++docCount;
                  _CLDELETE(stored);
              }
          }
      }_CLFINALLY(
          //Close and delete the writer, whether or not the copy succeeded
          fieldsWriter->close();
          _CLDELETE( fieldsWriter );
      );

      return docCount;
  }

  void SegmentMerger::mergeVectors(){
    TermVectorsWriter* termVectorsWriter = 
      _CLNEW TermVectorsWriter(directory, segment, fieldInfos);

    try {
      for (uint32_t r = 0; r < readers.size(); r++) {
        IndexReader* reader = readers[r];
        int32_t maxDoc = reader->maxDoc();
        for (int32_t docNum = 0; docNum < maxDoc; docNum++) {
          // skip deleted docs
          if (reader->isDeleted(docNum)) {
            continue;
          }
          termVectorsWriter->openDocument();

          // get all term vectors
          TermFreqVector** sourceTermVector =
            reader->getTermFreqVectors(docNum);

          if (sourceTermVector != NULL) {
            int32_t f = 0;
            TermFreqVector* termVector=NULL;
            while ( (termVector=sourceTermVector[f++]) != NULL ){
              termVectorsWriter->openField(termVector->getField());
              const TCHAR** terms = termVector->getTerms();
              const int32_t* freqs = termVector->getTermFrequencies();
              
              int32_t t = 0;
              while ( terms[t] != NULL ){
                termVectorsWriter->addTerm(terms[t], freqs[t]);
                //todo: delete terms string return
                t++;
              }

              _CLDELETE(termVector);
            }
            _CLDELETE_ARRAY(sourceTermVector);
          }
          termVectorsWriter->closeDocument();
        }
      }
    }_CLFINALLY( _CLDELETE(termVectorsWriter); );
  }


  void SegmentMerger::mergeTerms() {
  //Func - Merge the terms of all segments
  //Pre  - fieldInfos != NULL
  //Post - The terms of all segments have been merged

	  CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");

      try{
		  //create a filename for the new Frequency File for segment
          const char* buf = Misc::segmentname(segment,".frq");
		  //Open an IndexOutput to the new Frequency File
          freqOutput = directory->createOutput( buf );
          //Destroy the buffer of the filename
          _CLDELETE_CaARRAY(buf);
		
		  //create a filename for the new Prox File for segment
          buf = Misc::segmentname(segment,".prx");
		  //Open an IndexOutput to the new Prox File
          proxOutput = directory->createOutput( buf );
		  //delete buffer
          _CLDELETE_CaARRAY( buf );
		
		  //Instantiate  a new termInfosWriter which will write in directory

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -