⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segmentreader.cpp

📁 clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "SegmentHeader.h"

#include "FieldInfos.h"
#include "FieldsReader.h"
#include "IndexReader.h"
#include "TermInfosReader.h"
#include "Terms.h"

CL_NS_USE(util)
CL_NS_USE(store)
CL_NS_USE(document)
CL_NS_DEF(index)

 SegmentReader::Norm::Norm(IndexInput* instrm, int32_t n, SegmentReader* r, const char* seg):
   in(instrm),
   number(n),
   reader(r),
   segment(seg)
 {
  //Func - Constructor
  //Pre  - instrm is a valid IndexInput; the pointer is stored in 'in'
  //       n, r and seg are stored as given (the seg pointer itself is kept)
  //Post - The instance has no bytes array yet and is not marked dirty

      dirty = false;
      bytes = NULL;
  }

  SegmentReader::Norm::~Norm() {
  //Func - Destructor
  //Pre  - true
  //Post - The bytes array and the IndexInput held in 'in' have been deleted

      //Release the norm bytes (array delete)
      _CLDELETE_ARRAY(bytes);

      //'in' is a pointer to an IndexInput, so it has to be deleted explicitly.
      //Per the original author's note, its destructor also closes the stream.
      _CLDELETE(in);
  }

  void SegmentReader::Norm::reWrite(){
      char buf[CL_MAX_PATH];
      char fileName[CL_MAX_PATH];
      sprintf(buf,"%s.tmp",segment);

      // NOTE: norms are re-written in regular directory, not cfs
      IndexOutput* out = reader->getDirectory()->createOutput(buf);
      try {
        out->writeBytes(bytes, reader->maxDoc());
      }_CLFINALLY( out->close(); _CLDELETE(out) );

      sprintf(fileName,"%s.f%d",segment,number);
      reader->getDirectory()->renameFile(buf, fileName);
      this->dirty = false;
    }

  SegmentReader::SegmentReader(SegmentInfo* si) :
      IndexReader(si->getDir()),   //base class is built on the segment's directory
      _norms(false,false)
  {
      //Func - Constructor
      //Pre  - si points to a valid SegmentInfo instance (it is dereferenced here)
      //Post - The segment described by si has been opened through initialize

      initialize(si);
  }

  SegmentReader::SegmentReader(SegmentInfos* sis, SegmentInfo* si) :
      IndexReader(si->getDir(),sis,false),   //base class also receives sis and 'false'
      _norms(false,false)
  {
      //Func - Constructor
      //Pre  - si points to a valid SegmentInfo instance (it is dereferenced here)
      //       sis is handed to the IndexReader base class unchanged
      //Post - The segment described by si has been opened through initialize

      initialize(si);
  }

  void SegmentReader::initialize(SegmentInfo* si){
      //Func - Opens the files of the segment described by si
      //Pre  - si points to a valid SegmentInfo instance
      //Post - Field infos, the frequency and proximity streams, the fields
      //       reader, the term infos reader and the norms have been opened;
      //       deletions and term vectors have been opened when present

      //No deletions and no pending changes yet
      deletedDocs      = NULL;
      deletedDocsDirty = false;
      normsDirty       = false;
      undeleteAll      = false;

      //The postings streams are opened further down. They are kept open so
      //that if an index update removes the files we'll still have them
      freqStream       = NULL;
      proxStream       = NULL;

      //Keep a duplicate of the segment name taken from the SegmentInfo
      segment          = STRDUP_AtoA(si->name);

      //Scratch buffer reused for every file name built below
      char fileName[CL_MAX_PATH];

      //Read through the compound file when "<segment>.cfs" exists,
      //otherwise read straight from the reader's own directory
      cfsReader = NULL;
      Directory* srcDir = getDirectory();
      SegmentName(fileName, CL_MAX_PATH, ".cfs");
      if (srcDir->fileExists(fileName)) {
          cfsReader = _CLNEW CompoundFileReader(srcDir, fileName);
          srcDir = cfsReader;
      }

      //Field infos file (.fnm)
      SegmentName(fileName, CL_MAX_PATH, ".fnm");
      fieldInfos = _CLNEW FieldInfos(srcDir, fileName );
      CND_CONDITION(fieldInfos != NULL,"No memory could be allocated for fieldInfos");

      //Frequency file (.frq). When LUCENE_FS_MMAP is defined and the
      //directory reports type "FS", the file is opened through openMMapFile
      SegmentName(fileName, CL_MAX_PATH, ".frq");
#ifdef LUCENE_FS_MMAP
      if ( strcmp(srcDir->getDirectoryType(), "FS") == 0 ){
          freqStream = ((FSDirectory*)srcDir)->openMMapFile( fileName );
      }else
#endif
          freqStream = srcDir->openInput( fileName );
      CND_CONDITION(freqStream != NULL, "IndexInput freqStream could not open the frequency file");

      //Proximity file (.prx)
      SegmentName(fileName, CL_MAX_PATH, ".prx");
      proxStream = srcDir->openInput( fileName );
      CND_CONDITION(proxStream != NULL, "IndexInput proxStream could not open proximity file");

      //Reader for the stored fields of the segment
      fieldsReader = _CLNEW FieldsReader(srcDir, segment, fieldInfos);
      CND_CONDITION(fieldsReader != NULL,"No memory could be allocated for fieldsReader");

      //Reader for the term dictionary of the segment
      tis = _CLNEW TermInfosReader(srcDir, segment, fieldInfos);
      CND_CONDITION(tis != NULL,"No memory could be allocated for tis");

      //Load the deletions when the SegmentInfo reports a .del file
      // NOTE: the bitvector is stored using the regular directory, not cfs
      if (hasDeletions(si)){
          SegmentName(fileName, CL_MAX_PATH, ".del");
          deletedDocs = _CLNEW BitSet(getDirectory(), fileName );
      }

      //Open the norm files. Per the original notes there is one .f[0-9]*
      //file per indexed field, holding one byte per document that is
      //multiplied into the score for hits on that field
      openNorms(srcDir);

      //Term vector files are only opened when the field infos report vectors
      termVectorsReader = NULL;
      if (fieldInfos->hasVectors()) {
          termVectorsReader = _CLNEW TermVectorsReader(srcDir, segment, fieldInfos);
      }
  }

  SegmentReader::~SegmentReader(){
  //Func - Destructor
  //Pre  - true
  //Post - doClose has run and the objects held by this reader are released

      //Closing here means the index reader doesn't need to be closed manually
      doClose();

      //Readers and streams created in initialize
      _CLDELETE(fieldInfos);
      _CLDELETE(fieldsReader);
      _CLDELETE(tis);
      _CLDELETE(freqStream);
      _CLDELETE(proxStream);

      //The duplicated segment name, then the deletions bit set
      _CLDELETE_CaARRAY(segment);
      _CLDELETE(deletedDocs);

      _CLDELETE(termVectorsReader);

      //The compound file reader is released through _CLDECDELETE
      _CLDECDELETE(cfsReader);
  }

  void SegmentReader::doCommit(){
   char bufdel[CL_MAX_PATH];
   strcpy(bufdel,segment);
   strcat(bufdel,".del");

    if (deletedDocsDirty) {               // re-write deleted 
       char buftmp[CL_MAX_PATH];
       strcpy(buftmp,segment);
       strcat(buftmp,".tmp");
      deletedDocs->write(getDirectory(), buftmp);
      getDirectory()->renameFile(buftmp,bufdel);
    }
    if(undeleteAll && getDirectory()->fileExists(bufdel)){
      getDirectory()->deleteFile(bufdel);
    }
    if (normsDirty) {               // re-write norms 
	  CL_NS(util)::CLHashtable<const TCHAR*,Norm*,Compare::TChar,Equals::TChar>::iterator itr = _norms.begin();
		  Norm* norm;
      while (itr != _norms.end()) {
        norm = itr->second;
        if (norm->dirty) {
          norm->reWrite();
        }
        ++itr;
      }
    }
    deletedDocsDirty = false;
    normsDirty = false;
    undeleteAll = false;
  }
  
  void SegmentReader::doClose() {
  //Func - Closes the readers and streams of this segment
  //Pre  - fieldsReader != NULL
  //       tis != NULL
  //Post - close() has been called on the fields reader, the term infos
  //       reader and every stream or reader that was not NULL, and
  //       closeNorms() has run

      CND_PRECONDITION(fieldsReader != NULL, "fieldsReader is NULL");
      CND_PRECONDITION(tis != NULL, "tis is NULL");

      //Stored fields and term dictionary first
      fieldsReader->close();
      tis->close();

      //Frequency and proximity streams, when they were opened
      if (NULL != freqStream)
          freqStream->close();
      if (NULL != proxStream)
          proxStream->close();

      //Norm files
      closeNorms();

      //Term vectors and the compound file only exist for some segments
      if (NULL != termVectorsReader){
          termVectorsReader->close();
      }
      if (NULL != cfsReader){
          cfsReader->close();
      }
  }

  bool SegmentReader::hasDeletions() {
  //Func - Tells whether this reader currently holds a deletions bit set
  //Pre  - true
  //Post - true is returned if deletedDocs is not NULL, otherwise false

      const bool present = (NULL != deletedDocs);
      return present;
  }

  //static 
  bool SegmentReader::usesCompoundFile(SegmentInfo* si) {
  //Func - Static method
  //       Checks if the segment described by si has a compound (.cfs) file
  //Pre  - si points to a valid SegmentInfo instance
  //Post - true is returned if "<name>.cfs" exists in the directory of si,
  //       otherwise false

      char cfsName[CL_MAX_PATH];
      //Built with the length-aware helper that hasDeletions(si) uses for
      //".del", instead of unbounded strcpy/strcat into a fixed buffer
      Misc::segmentname(cfsName, CL_MAX_PATH, si->name, ".cfs", -1);
      return si->getDir()->fileExists(cfsName);
  }
  
  //static
  bool SegmentReader::hasSeparateNorms(SegmentInfo* si) {
  //Func - Static method
  //       Checks if the directory of si lists a file whose name starts with
  //       "<name>.f" immediately followed by a decimal digit
  //Pre  - si points to a valid SegmentInfo instance
  //Post - true is returned if such a file is listed, otherwise false.
  //       The listing returned by the directory has been freed

      //Build the prefix before the listing is allocated, using the
      //length-aware helper that hasDeletions(si) uses for ".del"
      char prefix[CL_MAX_PATH];
      Misc::segmentname(prefix, CL_MAX_PATH, si->name, ".f", -1);
      const size_t prefixLen = strlen(prefix);

      char** names = si->getDir()->list();
      if (names == NULL)
          return false;   //nothing listed, so nothing to scan or free

      bool found = false;
      //The listing is NULL terminated. Every entry has to be visited even
      //after a match because each one is freed here
      for (int32_t i = 0; names[i] != NULL; ++i) {
          const char* name = names[i];
          //strncmp already fails for names shorter than the prefix, and a
          //name equal to the prefix has '\0' (not a digit) at prefixLen,
          //so no separate length check is needed
          if (!found && strncmp(name, prefix, prefixLen) == 0) {
              const char next = name[prefixLen];
              found = (next >= '0' && next <= '9');
          }
          _CLDELETE_CaARRAY(names[i]);
      }
      _CLDELETE_ARRAY(names);
      return found;
  }

  bool SegmentReader::hasDeletions(const SegmentInfo* si) {
  //Func - Static method
  //       Checks if a segment managed by SegmentInfo si-> has deletions
  //Pre  - si-> holds a valid reference to an SegmentInfo instance
  //Post - if the segement contains deleteions true is returned otherwise flas

	  //Create a buffer f of length CL_MAX_PATH
      char f[CL_MAX_PATH];
      //SegmentReader::segmentname(f, si->name,_T(".del"),-1 );
      //create the name of the deletion file
	  Misc::segmentname(f,CL_MAX_PATH, si->name,".del",-1 );
	  //Check if the deletion file exists and return the result
      return si->getDir()->fileExists( f );
  }

	//synchronized
  void SegmentReader::doDelete(const int32_t docNum){
  //Func - Marks document docNum as deleted
  //Pre  - docNum >= 0 and docNum < maxDoc()
  //Post - The bit for docNum is set in deletedDocs, the deletions are
  //       flagged dirty and a pending undelete-all is cancelled

      SCOPED_LOCK_MUTEX(THIS_LOCK)

      CND_PRECONDITION(docNum >= 0, "docNum is a negative number");
      CND_PRECONDITION(docNum < maxDoc(), "docNum is bigger than the total number of documents");

      //The bit set is created on the first deletion, sized by maxDoc()
      if (NULL == deletedDocs){
          deletedDocs = _CLNEW BitSet(maxDoc());
          CND_CONDITION(deletedDocs != NULL,"No memory could be allocated for deletedDocs");
      }

      //Record the pending change, then mark the document itself
      undeleteAll = false;
      deletedDocsDirty = true;
      deletedDocs->set(docNum);
  }

  void SegmentReader::doUndeleteAll(){
  //Func - Drops the in-memory deletions of this segment
  //Pre  - true
  //Post - deletedDocs has been deleted, nothing is pending to be written,
  //       and undeleteAll is set (doCommit checks it to remove the .del file)

      _CLDELETE(deletedDocs);
      undeleteAll = true;
      deletedDocsDirty = false;
  }

  AStringArrayConstWithDeletor* SegmentReader::files() {
  //Func - Returns the names of the files that belong to this segment
  //Pre  - segment != NULL
  //Post - A newly allocated array is returned. It holds the name of every
  //       fixed-extension segment file that exists in the directory, plus
  //       one ".f<i>" norm file name for every indexed field i (those are
  //       added without an existence check)

     CND_PRECONDITION(segment != NULL, "segment is NULL");

     //Created with 'true'; presumably the array then frees the strings pushed
     //into it, which is why names are handed over without copying
     AStringArrayConstWithDeletor* files = _CLNEW AStringArrayConstWithDeletor(true);

     //Condition check to see if files points to a valid instance
     CND_CONDITION(files != NULL, "No memory could be allocated for files");

     //Fixed extensions, in the order they are reported. A table and a loop
     //replace the former function-local macro, which used a reserved
     //identifier and stayed defined for the rest of the translation unit
     static const char* const extensions[] = {
         ".cfs", ".fnm", ".fdx", ".fdt", ".tii", ".tis", ".frq",
         ".prx", ".del", ".tvx", ".tvd", ".tvf", ".tvp"
     };
     const size_t extensionCount = sizeof(extensions) / sizeof(extensions[0]);

     for (size_t e = 0; e < extensionCount; ++e) {
         //SegmentName(ext) returns a newly allocated name; keep it only when
         //the file exists, otherwise free it right away
         char* name = SegmentName(extensions[e]);
         if (getDirectory()->fileExists(name))
             files->push_back(name);
         else
             _CLDELETE_CaARRAY(name);
     }

     //Add the norm file of every indexed field
     for (int32_t i = 0; i < fieldInfos->size(); ++i) {
         FieldInfo* fi = fieldInfos->fieldInfo(i);
         if (fi->isIndexed){
             files->push_back( SegmentName(".f", i) );
         }
     }

     return files;
  }

  TermEnum* SegmentReader::terms() const {
  //Func - Returns an enumeration of all the Terms and TermInfos in the set. 
  //Pre  - tis != NULL
  //Post - An enumeration of all the Terms and TermInfos in the set has been returned

      CND_PRECONDITION(tis != NULL, "tis is NULL");

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -