📄 segmentreader.cpp

📁 clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
      return tis->terms();
  }

  TermEnum* SegmentReader::terms(const Term* t) const {
  //Func - Asks tis for a term enumeration that starts at the named term t or,
  //       if t is not present, at the first term after it
  //Pre  - t   != NULL
  //       tis != NULL
  //Post - The enumeration obtained from tis for term t has been returned

      CND_PRECONDITION(t   != NULL, "t is NULL");
      CND_PRECONDITION(tis != NULL, "tis is NULL");

      //Delegate the lookup to tis and hand its result straight back
      TermEnum* enumeration = tis->terms(t);
      return enumeration;
  }

  Document* SegmentReader::document(const int32_t n) {
  //Func - Fetches the stored document with number n
  //Pre  - n >= 0
  //Post - A CL_ERR_InvalidState error has been thrown when document n is marked
  //       deleted, otherwise the document read through fieldsReader has been returned

      SCOPED_LOCK_MUTEX(THIS_LOCK)

      CND_PRECONDITION(n >= 0, "n is a negative number");

      //Deleted documents may not be handed out
      const bool removed = isDeleted(n);
      if (removed){
          _CLTHROWA( CL_ERR_InvalidState,"attempt to access a deleted document" );
      }

      //Read document n and verify that something came back
      Document* found = fieldsReader->doc(n);
      CND_CONDITION(found != NULL, "No document could be retrieved");

      return found;
  }


  bool SegmentReader::isDeleted(const int32_t n) {
  //Func - Tells whether document n is marked as deleted
  //Pre  - n >= 0
  //Post - true has been returned if deletedDocs exists and has entry n set,
  //       otherwise false

      SCOPED_LOCK_MUTEX(THIS_LOCK)

      CND_PRECONDITION(n >= 0, "n is a negative number");

      //Without a deletedDocs instance no document can be deleted
      if (deletedDocs != NULL){
          if (deletedDocs->get(n))
              return true;
      }

      return false;
  }

  TermDocs* SegmentReader::termDocs() const {
  //Func - Creates a new SegmentTermDocs enumerator bound to this reader
  //Pre  - true
  //Post - A newly allocated SegmentTermDocs has been returned; it has not been
  //       positioned on any term by this method

       TermDocs* enumerator = _CLNEW SegmentTermDocs(this);
       return enumerator;
  }

  TermPositions* SegmentReader::termPositions() const {
  //Func - Creates a new SegmentTermPositions enumerator bound to this reader
  //Pre  - true
  //Post - A newly allocated SegmentTermPositions has been returned; it has not
  //       been positioned on any term by this method

      TermPositions* enumerator = _CLNEW SegmentTermPositions(this);
      return enumerator;
  }

  int32_t SegmentReader::docFreq(const Term* t) const {
  //Func - Looks up how many documents contain the term t
  //Pre  - t holds a valid reference to a Term
  //Post - The docFreq stored in the TermInfo of t has been returned, or 0 when
  //       tis has no TermInfo for t

      //Ask tis for the TermInfo belonging to t
      TermInfo* info = tis->get(t);

      //Unknown term: no document contains it
      if (info == NULL)
          return 0;

      //Copy the frequency out before the TermInfo is destroyed
      const int32_t frequency = info->docFreq;
      _CLDELETE( info );

      return frequency;
  }

  int32_t SegmentReader::numDocs() {
  //Func - Computes the number of documents in the segment that are not deleted
  //Pre  - true
  //Post - maxDoc() minus the count of deletedDocs (if any) has been returned

      //Total number of documents, deleted ones included
      const int32_t total = maxDoc();

      //Nothing has been deleted, so every document counts
      if (deletedDocs == NULL)
          return total;

      //Leave out the documents that have been marked deleted
      return total - deletedDocs->count();
  }

  int32_t SegmentReader::maxDoc() const {
  //Func - Reports the total number of documents in the segment, counting the
  //       ones that have been marked deleted as well
  //Pre  - true
  //Post - The size reported by fieldsReader has been returned

      const int32_t total = fieldsReader->size();
      return total;
  }


  void SegmentReader::norms(const TCHAR* field, uint8_t* bytes) {
  //Func - Reads the Norms for field from disk starting at offset in the inputstream
  //Pre  - field != NULL
  //       bytes != NULL is an array of bytes which is to be used to read the norms into.
  //       it is advisable to have bytes initalized by zeroes!
  //Post - The if an inputstream to the norm file could be retrieved the bytes have been read
  //       You are never sure whether or not the norms have been read into bytes properly!!!!!!!!!!!!!!!!!

    CND_PRECONDITION(field != NULL, "field is NULL");
    CND_PRECONDITION(bytes != NULL, "field is NULL");

	 SCOPED_LOCK_MUTEX(THIS_LOCK)
    
    Norm* norm = _norms.get(field);
    if ( norm == NULL )
       return;					  // use zeros in array

   if (norm->bytes != NULL) { // can copy from cache
      memcpy(bytes,norm->bytes,maxDoc());
      return;
    }

   IndexInput* _normStream = norm->in->clone();
   CND_PRECONDITION(_normStream != NULL, "normStream==NULL")

    // read from disk
    try{ 
       _normStream->seek(0);
       _normStream->readBytes(bytes, maxDoc());
    }_CLFINALLY(
        //Have the normstream closed
        _normStream->close();
        //Destroy the normstream
        _CLDELETE( _normStream );
	);
	
  }

  uint8_t* SegmentReader::norms(const TCHAR* field) {
  //Func - Hands out the byte array holding the norms of the named field,
  //       loading and caching it on first use
  //Pre  - field != NULL and names the field whose norms are wanted
  //Post - NULL has been returned when no Norm is registered for field.
  //       Otherwise the bytes array stored in the Norm has been returned; if it
  //       did not exist yet it has been allocated with maxDoc() entries, filled
  //       through norms(field, bytes) and stored in the Norm.

    CND_PRECONDITION(field != NULL, "field is NULL");

    SCOPED_LOCK_MUTEX(THIS_LOCK)

    //Look up the Norm registered for field
    Norm* entry = _norms.get(field);
    if (entry == NULL)
        return NULL;

    //Already loaded earlier: reuse the cached array
    if (entry->bytes != NULL)
        return entry->bytes;

    //First request: allocate an array and fill it
    uint8_t* buffer = _CL_NEWARRAY(uint8_t,maxDoc());
    CND_CONDITION(buffer != NULL, "bytes is NULL");

    norms(field, buffer);

    //Remember the array in the Norm for later calls
    entry->bytes = buffer;
    return entry->bytes;
  }

  void SegmentReader::doSetNorm(int32_t doc, const TCHAR* field, uint8_t value){
  //Func - Overwrites the norm of document doc for the named field with value
  //Pre  - doc is a valid index into the norms array of field (not checked here)
  //Post - Nothing has happened when no Norm is registered for field. Otherwise
  //       the Norm and the reader have been flagged dirty and entry doc of the
  //       norms array of field has been set to value.

    Norm* entry = _norms.get(field);
    if (entry != NULL){
        //Flag both the Norm and the reader as modified
        entry->dirty = true;
        normsDirty = true;

        //norms(field) loads the array on demand before it is written to
        norms(field)[doc] = value;
    }
  }


  char* SegmentReader::SegmentName(const char* ext, const int32_t x){
  //Func - Allocates a buffer of CL_MAX_PATH chars and has the file name for
  //       segment, ext and x written into it
  //Pre  - ext != NULL and holds the extension
  //       x contains a number
  //Post - The newly allocated buffer has been returned. According to the buffer
  //       overload, x = -1 yields segment + ext, any other x yields segment + ext + x.
  //       NOTE(review): the buffer is not released here; presumably the caller
  //       must delete it - confirm against the callers.

      CND_PRECONDITION(ext     != NULL, "ext is NULL");

      //Reserve room for the longest path that is supported
      char* fileName = _CL_NEWARRAY(char,CL_MAX_PATH);

      //Let the buffer based overload do the formatting
      SegmentName(fileName,CL_MAX_PATH,ext,x);

      return fileName;
  }

  void SegmentReader::SegmentName(char* buffer,int32_t bufferLen, const char* ext, const int32_t x ){
  //Func - Writes a file name into buffer by handing segment, ext and x to
  //       Misc::segmentname
  //Pre  - buffer  != NULL and offers room for bufferLen chars
  //       segment != NULL
  //       ext     != NULL (documented requirement, not checked here)
  //       x contains a number
  //Post - buffer has been filled by Misc::segmentname. As documented for this
  //       method: x = -1 gives segment + ext, any other x gives segment + ext + x.

      CND_PRECONDITION(buffer  != NULL, "buffer is NULL");
      CND_PRECONDITION(segment != NULL, "Segment is NULL");

      //The formatting itself lives in Misc
      Misc::segmentname(buffer, bufferLen, segment, ext, x);
  }
  void SegmentReader::openNorms(Directory* cfsDir) {
  //Func - Registers a Norm, with an opened input stream on its norm file, for
  //       every indexed field of the segment
  //Pre  - cfsDir is used for norm files that do not exist in getDirectory()
  //Post - _norms holds, under the name of each indexed field, a new Norm that
  //       was built from the opened norm file, the field number, this reader
  //       and segment

      for (int32_t i = 0; i < fieldInfos->size(); ++i) {
          FieldInfo* info = fieldInfos->fieldInfo(i);

          //Fields that are not indexed are skipped
          if (!info->isIndexed)
              continue;

          //Build the name of the norm file: segment + ".f" + field number
          char normFile[CL_MAX_PATH];
          SegmentName(normFile,CL_MAX_PATH, ".f", info->number);

          //Prefer the reader's own directory, fall back to cfsDir when the
          //file is not present there
          Directory* source = getDirectory();
          if ( !source->fileExists(normFile) )
              source = cfsDir;

          //TODO, should info->name be copied?
          Norm* created = _CLNEW Norm( source->openInput( normFile ),info->number, this, segment );
          _norms.put(info->name, created);
      }
  }

  void SegmentReader::closeNorms() {
  //Func - Close all the norms stored in norms
  //Pre  - true
  //Post - All the norms have been destroyed

    SCOPED_LOCK_MUTEX(_norms.THIS_LOCK)
	//Create an interator initialized at the beginning of norms
	CL_NS(util)::CLHashtable<const TCHAR*,Norm*,Compare::TChar,Equals::TChar>::iterator itr = _norms.begin();
	//Iterate through all the norms
    while (itr != _norms.end()) {
        //Get the norm
        Norm* n = itr->second;
        //delete the norm n
        _CLDELETE(n);
        //Move the interator to the next norm in the norms collection.
	    //Note ++ is an overloaded operator
        ++itr;
     }
    _norms.clear(); //bvk: they're deleted, so clear them so that they are not re-used
  }

    /**
   * @see IndexReader#getFieldNames()
   */
  TCHAR** SegmentReader::getFieldNames(){
  //Func - Collects the names of all fields known to fieldInfos, each name once
  //Pre  - true
  //Post - A newly allocated, NULL terminated array has been returned. Every
  //       entry is a duplicate (STRDUP_TtoT) of a distinct field name, in the
  //       order in which the names first appear in fieldInfos.

    const int32_t len = fieldInfos->size();

    //Room for every field plus the terminating NULL
    TCHAR** ret = _CL_NEWARRAY(TCHAR*,len+1);

    //p counts the names stored so far
    int32_t p = 0;
    for (int32_t i = 0; i < len; ++i) {
      FieldInfo* fi = fieldInfos->fieldInfo(i);

      //Only the p slots filled so far may be inspected. The previous code used
      //a 'continue' inside this inner loop, which merely advanced the inner
      //loop and therefore never skipped a duplicate.
      bool alreadyListed = false;
      for (int32_t j = 0; j < p; ++j) {
        if ( _tcscmp(fi->name,ret[j]) == 0 ) {
          alreadyListed = true;
          break;
        }
      }

      if (!alreadyListed)
        ret[p++]=STRDUP_TtoT(fi->name);
    }

    ret[p]=NULL;
    return ret;
  }

  /**
   * @see IndexReader#getFieldNames(boolean)
   */
  TCHAR** SegmentReader::getFieldNames(bool indexed) {
  //Func - Collects the names of the fields whose isIndexed flag equals indexed
  //Pre  - true
  //Post - A newly allocated, NULL terminated array has been returned that
  //       holds a duplicate (STRDUP_TtoT) of every name in the collected set

    //Set used to keep every name only once
    CL_NS(util)::CLSetList<const TCHAR*> names(false);

    for (int32_t i = 0; i < fieldInfos->size(); ++i) {
      FieldInfo* info = fieldInfos->fieldInfo(i);

      //Skip fields whose indexed state is not the requested one
      if (info->isIndexed != indexed)
          continue;

      if ( names.find(info->name)==names.end() )
          names.insert(info->name);
    }

    //Copy the collected names into an array with one extra slot for NULL
    TCHAR** result = _CL_NEWARRAY(TCHAR*,names.size()+1);
    int slot=0;
    for (CL_NS(util)::CLSetList<const TCHAR*>::iterator pos = names.begin(); pos != names.end(); ++pos){
        const TCHAR* name = *pos;
        result[slot]=STRDUP_TtoT(name);
        ++slot;
    }
    result[names.size()]=NULL;
    return result;
  }

  /**
   * 
   * @param storedTermVector if true, returns only Indexed fields that have term vector info, 
   *                        else only indexed fields without term vector info 
   * @return Collection of Strings indicating the names of the fields
   */
   TCHAR** SegmentReader::getIndexedFieldNames(bool storedTermVector) {
  //Func - Collects the names of the indexed fields whose storeTermVector flag
  //       equals storedTermVector
  //Pre  - true
  //Post - A newly allocated, NULL terminated array has been returned that
  //       holds a duplicate (STRDUP_TtoT) of every name in the collected set

    //Set used to keep every name only once
    CL_NS(util)::CLSetList<const TCHAR*> names(false);

    for (int32_t i = 0; i < fieldInfos->size(); ++i) {
      FieldInfo* info = fieldInfos->fieldInfo(i);

      //The field must be indexed and its term vector state must match
      const bool wanted = info->isIndexed == true && info->storeTermVector == storedTermVector;
      if (!wanted)
          continue;

      if ( names.find(info->name)==names.end() )
          names.insert(info->name);
    }

    //Copy the collected names into an array with one extra slot for NULL
    TCHAR** result = _CL_NEWARRAY(TCHAR*,names.size()+1);
    int slot=0;
    for (CL_NS(util)::CLSetList<const TCHAR*>::iterator pos = names.begin(); pos != names.end(); ++pos){
        const TCHAR* name = *pos;
        result[slot]=STRDUP_TtoT(name);
        ++slot;
    }
    result[names.size()]=NULL;
    return result;

  }

   TermFreqVector* SegmentReader::getTermFreqVector(int32_t docNumber, const TCHAR* field){
  //Func - Fetches the term frequency vector of the named field of document docNumber
  //Pre  - field names the field whose vector is wanted
  //Post - NULL has been returned when the field is unknown to fieldInfos, when
  //       it has no stored term vector, or when there is no termVectorsReader.
  //       Otherwise the result of termVectorsReader->get(docNumber, field) has
  //       been returned.

    // Check if this field is invalid or has no stored term vector
    FieldInfo* fi = fieldInfos->fieldInfo(field);
    if (fi == NULL || !fi->storeTermVector)
       return NULL;

    // Same guard as getTermFreqVectors(): without a term vectors reader there
    // is nothing to read from, so do not dereference it
    if (termVectorsReader == NULL)
       return NULL;

    return termVectorsReader->get(docNumber, field);
  }


   TermFreqVector** SegmentReader::getTermFreqVectors(int32_t docNumber){
  //Func - Fetches all term frequency vectors of document docNumber
  //Pre  - true
  //Post - NULL has been returned when there is no termVectorsReader, otherwise
  //       the result of termVectorsReader->get(docNumber)

    if (termVectorsReader != NULL)
      return termVectorsReader->get(docNumber);

    return NULL;
  }
CL_NS_END
上一页 12
💿 文件大小 2052 K
👤 上传用户 managerliu123
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#clucene #lucene #stl #检索
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -