⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 indexwriter.cpp

📁 clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.
💻 CPP
📖 第 1 页 / 共 2 页
字号:
	//Flush the RamSegments to disk
    flushRamSegments();
    while (segmentInfos->size() > 1 ||
		(segmentInfos->size() == 1 &&
			(SegmentReader::hasDeletions(segmentInfos->info(0)) ||
            segmentInfos->info(0)->getDir()!=directory ||
            (useCompoundFile &&
              (!SegmentReader::usesCompoundFile(segmentInfos->info(0)) ||
                SegmentReader::hasSeparateNorms(segmentInfos->info(0))))))) {

		int32_t minSegment = segmentInfos->size() - mergeFactor;

		mergeSegments(minSegment < 0 ? 0 : minSegment);
	}
  }


  char* IndexWriter::newSegmentName() {
    SCOPED_LOCK_MUTEX(THIS_LOCK)

    TCHAR buf[9];
    _i64tot(segmentInfos->counter++,buf,36); //36 is RADIX of 10 digits and 26 numbers

	int32_t rlen = _tcslen(buf) + 2;
	char* ret = _CL_NEWARRAY(char,rlen);
	strcpy(ret,"_");
	STRCPY_TtoA(ret+1,buf,rlen-1); //write at 2nd character, for a maximum of 9 characters
    return ret;
  }

  void IndexWriter::flushRamSegments() {
  //Func - Merges all RAM-resident segments.
  //Pre  - ramDirectory != NULL
  //Post - The RAM-resident segments have been merged to disk

	CND_PRECONDITION(ramDirectory != NULL, "ramDirectory is NULL");

    int32_t minSegment = segmentInfos->size()-1; //don't make this unsigned...
	CND_CONDITION(minSegment >= -1, "minSegment must be >= -1");

	int32_t docCount = 0;
	//Iterate through all the segements and check if the directory is a ramDirectory
	while (minSegment >= 0 &&
	  segmentInfos->info(minSegment)->getDir() == ramDirectory) {
	  docCount += segmentInfos->info(minSegment)->docCount;
	  minSegment--;
	}
	if (minSegment < 0 ||			  // add one FS segment?
		(docCount + segmentInfos->info(minSegment)->docCount) > mergeFactor ||
		!(segmentInfos->info(segmentInfos->size()-1)->getDir() == ramDirectory))
	  minSegment++;

	CND_CONDITION(minSegment >= 0, "minSegment must be >= 0");
	if (minSegment >= segmentInfos->size())
	  return;					  // none to merge
	mergeSegments(minSegment);
  }

  void IndexWriter::maybeMergeSegments() {
  //Func - Incremental Segment Merger
  //Pre  -
  //Post -

	int64_t targetMergeDocs = minMergeDocs;

	// find segments smaller than current target size
	while (targetMergeDocs <= maxMergeDocs) {
		int32_t minSegment = segmentInfos->size();
		int32_t mergeDocs = 0;

		while (--minSegment >= 0) {
			SegmentInfo* si = segmentInfos->info(minSegment);
			if (si->docCount >= targetMergeDocs)
				break;
			mergeDocs += si->docCount;
		}

		if (mergeDocs >= targetMergeDocs){
			// found a merge to do
			mergeSegments(minSegment+1);
		}else
			break;

		//increase target size
		targetMergeDocs *= mergeFactor;
	}
  }


  void IndexWriter::mergeSegments(const uint32_t minSegment) {
  //Func - Merges the segments from index minSegment up to the end of
  //       segmentInfos into one new segment stored in directory
  //Pre  - minSegment is an index into segmentInfos (not checked here)
  //Post - The entries from minSegment onwards have been replaced in
  //       segmentInfos by a single entry for the merged segment, and
  //       IndexWriterLockWith2::run() has been executed with the list of
  //       merged readers that live in directory or ramDirectory.
  //       NOTE(review): run() is not visible here; presumably it commits
  //       segmentInfos and deletes the queued segments - confirm.

    // constructed with 'false'; presumably the vector then does not own its elements - TODO confirm
    CLVector<SegmentReader*> segmentsToDelete(false);
    // heap-allocated name; released at the end of this function
    const char* mergedName = newSegmentName();
#ifdef _CL_DEBUG_INFO
	fprintf(_CL_DEBUG_INFO, "merging segments\n");
#endif
    SegmentMerger merger(directory, mergedName, useCompoundFile);
    // open a reader for every segment taking part in the merge
    for (int32_t i = minSegment; i < segmentInfos->size(); i++) {
      SegmentInfo* si = segmentInfos->info(i);
#ifdef _CL_DEBUG_INFO
	  fprintf(_CL_DEBUG_INFO, " %s (%d docs)\n",si->name,si->docCount);
#endif
      SegmentReader* reader = _CLNEW SegmentReader(si);
      merger.add(reader);
      if ((reader->getDirectory() == this->directory) || // if we own the directory
		(reader->getDirectory() == this->ramDirectory)){
        segmentsToDelete.push_back((SegmentReader*)reader);	  // queue segment for deletion
	  }
    }

    // merge() returns the document count recorded for the new segment
    int32_t mergedDocCount = merger.merge();

#ifdef _CL_DEBUG_INFO
	 fprintf(_CL_DEBUG_INFO,"\n into %s (%d docs)\n",mergedName, mergedDocCount);
#endif
	  
	segmentInfos->clearto(minSegment); // pop old infos & add new
    segmentInfos->add( _CLNEW SegmentInfo(mergedName, mergedDocCount, directory));


    // close readers before we attempt to delete now-obsolete segments
    merger.closeReaders();

    // the work in with.run() is tied to the directory's "commit.lock"
    LuceneLock* lock = directory->makeLock("commit.lock");
    IndexWriterLockWith2 with ( lock,LUCENE_COMMIT_LOCK_TIMEOUT,this,&segmentsToDelete );

    {
    	SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync
    	with.run();
    }

    _CLDELETE( lock );
    _CLDELETE_CaARRAY( mergedName ); // release the name obtained from newSegmentName()
  }

  void IndexWriter::deleteSegments(CLVector<SegmentReader*>* segments) {
  //Func - Deletes the files of the given segment readers, together with any
  //       files recorded earlier as still waiting for deletion
  //Pre  - segments is dereferenced without a check
  //Post - The names of files in directory that could not be deleted have
  //       been passed to writeDeleteableFiles

    AStringArrayConstWithDeletor stillPending;

    // first retry the files recorded by an earlier call
    AStringArrayConstWithDeletor* recorded = readDeleteableFiles();
    deleteFiles(recorded, &stillPending);
    _CLDELETE(recorded);

    uint32_t idx = 0;
    while (idx < segments->size()) {
      SegmentReader* segReader = (*segments)[idx];
      AStringArrayConstWithDeletor* segFiles = segReader->files();

      if (segReader->getDirectory() != this->directory)
        deleteFiles(segFiles, segReader->getDirectory()); // delete, eg, RAM files
      else
        deleteFiles(segFiles, &stillPending);	  // our files: failures are recorded

      _CLDELETE(segFiles);
      ++idx;
    }

    writeDeleteableFiles(&stillPending);		  // note files we can't delete
  }

  AStringArrayConstWithDeletor* IndexWriter::readDeleteableFiles() {
  //Func - Reads the file names stored in the "deletable" file of directory
  //Pre  - directory is dereferenced without a check
  //Post - A newly allocated list is returned (deleteSegments releases it
  //       with _CLDELETE); it is empty when no "deletable" file exists

    AStringArrayConstWithDeletor* names = _CLNEW AStringArrayConstWithDeletor;

    if (directory->fileExists("deletable")) {
      IndexInput* in = directory->openInput("deletable");
      try {
        TCHAR entry[CL_MAX_PATH];
        // the file starts with the number of names that follow
        int32_t remaining = in->readInt();
        while (remaining > 0) {
          in->readString(entry,CL_MAX_PATH);
          names->push_back(STRDUP_TtoA(entry));
          --remaining;
        }
      } _CLFINALLY(
          in->close();
          _CLDELETE(in);
      );
    }

    return names;
  }

  void IndexWriter::writeDeleteableFiles(AStringArrayConstWithDeletor* files) {
  //Func - Stores the given file names so that their deletion can be retried
  //Pre  - files is dereferenced without a check
  //Post - The count and the names have been written to "deleteable.new",
  //       which has then been renamed to "deletable"

    IndexOutput* out = directory->createOutput("deleteable.new");
    try {
      out->writeInt(files->size());

      TCHAR wideName[CL_MAX_PATH]; //scratch buffer for the TCHAR form of each name
      AStringArrayConstWithDeletor::const_iterator cur = files->begin();
      for (; cur != files->end(); ++cur) {
        const char* fileName = *cur;
        STRCPY_AtoT(wideName,fileName,CL_MAX_PATH);
        out->writeString( wideName, _tcslen(wideName) );
      }
    } _CLFINALLY(
        out->close();
        _CLDELETE(out);
    );

    directory->renameFile("deleteable.new", "deletable");
  }

  void IndexWriter::deleteFiles(AStringArrayConstWithDeletor* files, Directory* directory) {
  //Func - Deletes every listed file from the given directory
  //Pre  - files and directory are dereferenced without a check; note that
  //       the parameter directory hides the member of the same name
  //Post - directory->deleteFile has been called once per entry, in list
  //       order; nothing is caught here
	for (AStringArrayConstWithDeletor::const_iterator cur = files->begin();
		 cur != files->end(); ++cur) {
		directory->deleteFile( *cur );
	}
  }

  void IndexWriter::deleteFiles(AStringArrayConstWithDeletor* files, AStringArrayConstWithDeletor* deletable) {
  //Func - Tries to delete each listed file from this writer's directory
  //Pre  - files and deletable are dereferenced without a check
  //Post - deleteFile has been called for every listed file that exists.
  //       When a deletion fails with CL_ERR_IO and the file still exists,
  //       a copy of its name has been appended to deletable.
  //       A CLuceneError with any other number is propagated to the caller.
	  AStringArrayConstWithDeletor::const_iterator itr=files->begin();
	  while ( itr != files->end() ){
		const char* file = *itr;
		try {
			if ( directory->fileExists(file) )
				directory->deleteFile(file);		  // try to delete each file
		} catch (CLuceneError& err) {			  // if delete fails
			// Not an IO error: rethrow the exception object that was caught.
			// A bare 'throw' avoids the copy (and possible slicing) that
			// 'throw err' would make.
			if ( err.number() != CL_ERR_IO )
				throw;

			if (directory->fileExists(file)) {
	#ifdef _CL_DEBUG_INFO
				fprintf(_CL_DEBUG_INFO,"%s; Will re-try later.\n", err.what());
	#endif
				deletable->push_back(STRDUP_AtoA(file));		  // add to deletable
			}
		}
		++itr;
	  }
  }


  
  void IndexWriter::addIndexes(Directory** dirs) {
  //Func - Adds the indexes located in several directories to the index
  //       managed by this instance
  //Pre  - dirs != NULL and is an array of directories terminated by a
  //       NULL entry
  //Post - The segment infos read from every directory in dirs have been
  //       added to segmentInfos and optimize() has been run on the result

	  SCOPED_LOCK_MUTEX(THIS_LOCK)

	  CND_PRECONDITION(dirs != NULL, "dirs is NULL");

	  // start with zero or 1 seg so optimize the current
	  optimize();

	  //Visit the directories up to the terminating NULL entry
	  for (int32_t dirIdx = 0; dirs[dirIdx] != NULL; ++dirIdx) {
		  /* DSR:CL_BUG:
		  ** In CLucene 0.8.11 the infos added below were still owned by the
		  ** stack variable $incoming, which deleted them when it went out of
		  ** scope. That left dangling pointers in this->segmentInfos by the
		  ** time the final optimize() ran. Constructing $incoming with
		  ** 'false' tells it not to delete its members, which solved the
		  ** problem. */
		  SegmentInfos incoming(false);
		  incoming.read( dirs[dirIdx]);

		  for (int32_t segIdx = 0; segIdx < incoming.size(); ++segIdx)
			  segmentInfos->add(incoming.info(segIdx));	  // add each info
	  }

	  optimize();					  // cleanup
  }


  void IndexWriter::addIndexes(IndexReader** readers){
  //Func - Merges the given readers, plus the existing segment if there is
  //       one, into a single new segment
  //Pre  - readers is an array of readers terminated by a NULL entry; it is
  //       dereferenced without a check
  //Post - segmentInfos holds exactly one entry, describing the merged
  //       segment, and IndexWriterLockWith::run() has been executed.
  //       NOTE(review): run() is not visible here; presumably it commits
  //       segmentInfos under the commit lock - confirm.

	 SCOPED_LOCK_MUTEX(THIS_LOCK)
    optimize();					  // start with zero or 1 seg

    // heap-allocated name; released at the end of this function
    char* mergedName = newSegmentName();
    // NOTE(review): merger is never deleted in this function. Whether its
    // destructor would also release the caller's readers cannot be told
    // from here, so the allocation is left as it was - confirm and release.
    SegmentMerger* merger = _CLNEW SegmentMerger(directory, mergedName, false);

    // NOTE(review): the SegmentReader created here is not closed or deleted
    // by this function either - confirm who owns it.
    if (segmentInfos->size() == 1)                 // add existing index, if any
      merger->add(_CLNEW SegmentReader(segmentInfos->info(0)));

    int32_t readersLength = 0;
    while ( readers[readersLength] != NULL )
      merger->add((SegmentReader*) readers[readersLength++]);

    int32_t docCount = merger->merge();                // merge 'em

    // pop old infos & add new
	segmentInfos->clearto(0);
    segmentInfos->add(_CLNEW SegmentInfo(mergedName, docCount, directory));

    LuceneLock* lock = directory->makeLock("commit.lock");
    IndexWriterLockWith with ( lock,LUCENE_COMMIT_LOCK_TIMEOUT,this,true);

	{
		SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync
	   	with.run();
	}

      _CLDELETE(lock);
      // Release the name from newSegmentName(); previously it was leaked.
      // mergeSegments frees its name at the same point, after the same uses.
      _CLDELETE_CaARRAY(mergedName);
   }

CL_NS_END

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -