📄 segmentmerger.cpp

📁 clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
		  //for the segment name segment using the new merged fieldInfos
          termInfosWriter = _CLNEW TermInfosWriter(directory, segment, fieldInfos);  
          
          //Condition check to see if termInfosWriter points to a valid instance
          CND_CONDITION(termInfosWriter != NULL,"Memory allocation for termInfosWriter failed")	;
          
		  skipInterval = termInfosWriter->skipInterval;
          queue = _CLNEW SegmentMergeQueue(readers.size());


		  //And merge the Term Infos
          mergeTermInfos();	      
      }_CLFINALLY(
		  //Close and destroy the IndexOutput to the Frequency File
          if (freqOutput != NULL) 		{ freqOutput->close(); _CLDELETE(freqOutput); }
          //Close and destroy the IndexOutput to the Prox File
          if (proxOutput != NULL) 		{ proxOutput->close(); _CLDELETE(proxOutput); }
		  //Close and destroy the termInfosWriter
          if (termInfosWriter != NULL) 	{ termInfosWriter->close(); _CLDELETE(termInfosWriter); }
		  //Close and destroy the queue
          if (queue != NULL)            { queue->close(); _CLDELETE(queue);}
	  );
  }

  void SegmentMerger::mergeTermInfos(){
  //Func - Merges all TermInfos into a single segment
  //Pre  - true
  //Post - All TermInfos have been merged into a single segment

      //Condition check to see if queue points to a valid instance
      CND_CONDITION(queue != NULL, "Memory allocation for queue failed")	;

	  //base is the id of the first document in a segment
      int32_t base = 0;

      IndexReader* reader = NULL;
	   SegmentMergeInfo* smi = NULL;

	  //iterate through all the readers
      for (uint32_t i = 0; i < readers.size(); i++) {
		  //Get the i-th reader
          reader = readers[i];

          //Condition check to see if reader points to a valid instance
          CND_CONDITION(reader != NULL, "No SegmentReader found");

		    //Get the term enumeration of the reader
          TermEnum* termEnum = reader->terms();
          //Instantiate a new SegmentMerginfo for the current reader and enumeration
          smi = _CLNEW SegmentMergeInfo(base, termEnum, reader);

          //Condition check to see if smi points to a valid instance
          CND_CONDITION(smi != NULL, "Memory allocation for smi failed")	;

		  //Increase the base by the number of documents that have not been marked deleted
		  //so base will contain a new value for the first document of the next iteration
          base += reader->numDocs();
		  //Get the next current term
		  if (smi->next()){
              //Store the SegmentMergeInfo smi with the initialized SegmentTermEnum TermEnum
			  //into the queue
              queue->put(smi);
          }else{
			  //Apparently the end of the TermEnum of the SegmentTerm has been reached so
			  //close the SegmentMergeInfo smi
              smi->close();
			  //And destroy the instance and set smi to NULL (It will be used later in this method)
              _CLDELETE(smi);
              }
          }

	  //Instantiate an array of SegmentMergeInfo instances called match
      SegmentMergeInfo** match = _CL_NEWARRAY(SegmentMergeInfo*,readers.size()+1);

      //Condition check to see if match points to a valid instance
      CND_CONDITION(match != NULL, "Memory allocation for match failed")	;
     
      SegmentMergeInfo* top = NULL;

      //As long as there are SegmentMergeInfo instances stored in the queue
      while (queue->size() > 0) {
          int32_t matchSize = 0;			  
		  
		  // pop matching terms
          
		  //Pop the first SegmentMergeInfo from the queue
          match[matchSize++] = queue->pop();
		  //Get the Term of match[0]
          Term* term = match[0]->term;
			  
          //Condition check to see if term points to a valid instance
          CND_CONDITION(term != NULL,"term is NULL")	;

          //Get the current top of the queue
		  top = queue->top();

          //For each SegmentMergInfo still in the queue 
		  //Check if term matches the term of the SegmentMergeInfo instances in the queue
          while (top != NULL && term->equals(top->term) ){ //todo: changed to equals, but check if this is more efficient
			  //A match has been found so add the matching SegmentMergeInfo to the match array
              match[matchSize++] = queue->pop();
			  //Get the next SegmentMergeInfo
              top = queue->top();
          }
		  match[matchSize]=NULL;

		  //add new TermInfo
          mergeTermInfo(match); //matchSize  
		  
          //Restore the SegmentTermInfo instances in the match array back into the queue
          while (matchSize > 0){
              smi = match[--matchSize];
			  
              //Condition check to see if smi points to a valid instance
              CND_CONDITION(smi != NULL,"smi is NULL")	;

			  //Move to the next term in the enumeration of SegmentMergeInfo smi
			  if (smi->next()){
                  //There still are some terms so restore smi in the queue
                  queue->put(smi);
				  
              }else{
				  //Done with a segment
				  //No terms anymore so close this SegmentMergeInfo instance
                  smi->close();				  
                  _CLDELETE( smi );
              }
          }
     }

     _CLDELETE_ARRAY(match);
  }

  void SegmentMerger::mergeTermInfo( SegmentMergeInfo** smis){
  //Func - Writes the merged postings of one term and, when at least one
  //       document contains the term, adds the term to the term dictionary.
  //Pre  - smis != NULL, NULL terminated, all entries positioned at the same term
  //       freqOutput != NULL
  //       proxOutput != NULL
  //Post - The postings and the skip data of the term have been written.
  //       If df > 0 a TermInfo for smis[0]->term has been added to termInfosWriter.

      CND_PRECONDITION(smis != NULL, "smis is NULL");
      CND_PRECONDITION(freqOutput != NULL, "freqOutput is NULL");
      CND_PRECONDITION(proxOutput != NULL, "proxOutput is NULL");

      //Remember where the data of this term starts in both files; this must
      //happen before appendPostings() writes anything
      int64_t freqStart = freqOutput->getFilePointer();
      int64_t proxStart = proxOutput->getFilePointer();

      //Write the postings; the result is the number of documents that contain the term
      int32_t docCount = appendPostings(smis);

      //Write the buffered skip data and remember where it starts
      int64_t skipStart = writeSkip();

      //No document contains the term, so there is no dictionary entry to add
      if (docCount <= 0)
          return;

      //The skip data position is stored relative to the start of the freq data
      int32_t skipOffset = (int32_t)(skipStart - freqStart);
      termInfo.set(docCount, freqStart, proxStart, skipOffset);

      //Precondition check to be sure that smis[0]->term is valid
      CND_PRECONDITION(smis[0]->term != NULL, "smis[0]->term is NULL");

      //Add the dictionary entry
      termInfosWriter->add(smis[0]->term, &termInfo);
  }
	       
  
  int32_t SegmentMerger::appendPostings(SegmentMergeInfo** smis){
  //Func - Process postings from multiple segments all positioned on the
  //       same term. Writes out merged entries into the freqOutput and
  //       the proxOutput streams and buffers skip data along the way.
  //Pre  - smis != NULL, NULL terminated, all entries positioned at the same term
  //       freqOutput != NULL
  //       proxOutput != NULL
  //Post - Returns number of documents across all segments where this term was found

      CND_PRECONDITION(smis != NULL, "smis is NULL");
      CND_PRECONDITION(freqOutput != NULL, "freqOutput is NULL");
      CND_PRECONDITION(proxOutput != NULL, "proxOutput is NULL");

      int32_t lastDoc = 0;  //Last document number written, in merged numbering
      int32_t df = 0;       //Document counter

      //Start with an empty skip buffer for this term
      resetSkip();

      SegmentMergeInfo* smi = NULL;

      //Iterate through all SegmentMergeInfo instances in smis up to the NULL terminator
      for (int32_t i = 0; (smi = smis[i]) != NULL; i++) {
          //Condition check to see if smi points to a valid instance
          CND_PRECONDITION(smi != NULL, "smi is NULL");

          //Get the term positions
          TermPositions* postings = smi->postings;
          //Get the base of this segment
          int32_t base = smi->base;
          //Get the docMap; it is NULL when no document numbers need to be remapped
          int32_t* docMap = smi->docMap;

          //Position the postings on the current term of this segment
          postings->seek(smi->termEnum);
          while (postings->next()) {
              int32_t doc = postings->doc();
              if (docMap != NULL)
                  doc = docMap[doc];   // map around deletions
              doc += base;             // convert to merged space

              //Condition check to see doc is equal to or bigger than lastDoc
              CND_CONDITION(doc >= lastDoc,"docs out of order");

              //One more document contains the term
              df++;

              //Buffer a skip entry every skipInterval documents; it records
              //the previous document, lastDoc is not updated yet
              if ((df % skipInterval) == 0) {
                  bufferSkip(lastDoc);
              }

              //Document numbers are written as deltas, shifted left by one
              //so that the low bit can flag freq == 1
              int32_t docCode = (doc - lastDoc) << 1;
              lastDoc = doc;

              //Get the frequency of the term in this document
              int32_t freq = postings->freq();
              if (freq == 1){
                  //write doc & freq=1
                  freqOutput->writeVInt(docCode | 1);
              }else{
                  //write doc
                  freqOutput->writeVInt(docCode);
                  //write frequency in doc
                  freqOutput->writeVInt(freq);
              }

              //Write the positions as deltas to the previous position
              int32_t lastPosition = 0;
              for (int32_t j = 0; j < freq; j++) {
                  int32_t position = postings->nextPosition();
                  proxOutput->writeVInt(position - lastPosition);
                  lastPosition = position;
              }
          }
      }

      //Return total number of documents across all segments where term was found
      return df;
  }

  void SegmentMerger::resetSkip(){
  //Func - Starts a new run of skip data: empties the skip buffer and makes
  //       the current positions of the freq and prox outputs the reference
  //       points for the next skip entry.
    skipBuffer->reset();

    //The next skip entry is written relative to these values
    lastSkipProxPointer = proxOutput->getFilePointer();
    lastSkipFreqPointer = freqOutput->getFilePointer();
    lastSkipDoc = 0;
  }

  void SegmentMerger::bufferSkip(int32_t doc){
  //Func - Appends one skip entry to the skip buffer. The entry holds three
  //       VInts: the document number and the freq and prox file positions,
  //       each as the difference to the previous skip entry.
  //Post - doc and the current file positions are the new reference values
    const int64_t freqNow = freqOutput->getFilePointer();
    const int64_t proxNow = proxOutput->getFilePointer();

    const int32_t docDelta  = doc - lastSkipDoc;
    const int32_t freqDelta = static_cast<int32_t>(freqNow - lastSkipFreqPointer);
    const int32_t proxDelta = static_cast<int32_t>(proxNow - lastSkipProxPointer);

    skipBuffer->writeVInt(docDelta);
    skipBuffer->writeVInt(freqDelta);
    skipBuffer->writeVInt(proxDelta);

    //The next entry is written relative to this one
    lastSkipProxPointer = proxNow;
    lastSkipFreqPointer = freqNow;
    lastSkipDoc = doc;
  }

  int64_t SegmentMerger::writeSkip(){
  //Func - Copies the buffered skip data to the freq output.
  //Post - Returns the position in the freq output at which the skip data starts
    const int64_t skipStart = freqOutput->getFilePointer();

    //The position must be taken before the buffer is copied
    skipBuffer->writeTo(freqOutput);

    return skipStart;
  }

  void SegmentMerger::mergeNorms() {
  //Func - Merges the norms for all fields 
  //Pre  - fieldInfos != NULL
  //Post - The norms for all fields have been merged

      CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");

	  IndexReader* reader  = NULL;
	  IndexOutput*  output  = NULL;

	  //iterate through all the Field Infos instances
      for (int32_t i = 0; i < fieldInfos->size(); i++) {
		  //Get the i-th FieldInfo
          FieldInfo* fi = fieldInfos->fieldInfo(i);
		  //Is this Field indexed?
          if (fi->isIndexed){
			  //Create an new filename for the norm file
              const char* buf = Misc::segmentname(segment,".f", i);
			  //Instantiate  an IndexOutput to that norm file
              output = directory->createOutput( buf );

			  //Condition check to see if output points to a valid instance
              CND_CONDITION(output != NULL, "No Outputstream retrieved");

              //Destroy the buffer of the filename
              _CLDELETE_CaARRAY( buf );
              
			  try{
				  //Iterate throug all SegmentReaders
                  for (uint32_t j = 0; j < readers.size(); j++) {
					  //Get the i-th IndexReader
                      reader = readers[j];

                      //Condition check to see if reader points to a valid instance
                      CND_CONDITION(reader != NULL, "No reader found");

					  //Get an IndexInput to the norm file for this field in this segment
                      uint8_t* input = reader->norms(fi->name);

					  //Get the total number of documents including the documents that have been marked deleted
                      int32_t maxDoc = reader->maxDoc();
						  //Iterate through all the documents
                          for(int32_t k = 0; k < maxDoc; k++) {
                              //Get the norm
							  //Note that the byte must always be read especially when the document
							  //is marked deleted to remain in sync
                              uint8_t norm = input != NULL ? input[k] : 0;

                              //Check if document k is deleted
						      if (!reader->isDeleted(k)){
								  //write the new norm
                                  output->writeByte(norm);
                                  }
                              }
                          } 
                      
                  }
              _CLFINALLY(
				  if (output != NULL){
				      //Close the IndexOutput output
                      output->close();
			          //destroy it
                      _CLDELETE(output);
				      }
				  );
			  }
		 }
  }

CL_NS_END
上一页 12
💿 文件大小 2052 K
👤 上传用户 managerliu123
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#clucene #lucene #stl #检索
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -