📄 documentwriter.cpp

📁 clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
                    length++;
                    // Apply field truncation policy.
					if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) {
                      // The client programmer has explicitly authorized us to
                      // truncate the token stream after maxFieldLength tokens.
                      if ( length > maxFieldLength) {
                        break;
                      }
					} else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) {
                      const TCHAR* errMsgBase = 
                        _T("Indexing a huge number of tokens from a single")
                        _T(" field (\"%s\", in this case) can cause CLucene")
                        _T(" to use memory excessively.")
                        _T("  By default, CLucene will accept only %s tokens")
                        _T(" tokens from a single field before forcing the")
                        _T(" client programmer to specify a threshold at")
                        _T(" which to truncate the token stream.")
                        _T("  You should set this threshold via")
						      _T(" IndexReader::maxFieldLength (set to LUCENE_INT32_MAX")
                        _T(" to disable truncation, or a value to specify maximum number of fields).");
                      
                      TCHAR defaultMaxAsChar[34];
                      _i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH,
                          defaultMaxAsChar, 10
                        );
					         int32_t errMsgLen = _tcslen(errMsgBase)
                          + _tcslen(fieldName)
                          + _tcslen(defaultMaxAsChar);
                      TCHAR* errMsg = _CL_NEWARRAY(TCHAR,errMsgLen+1);

                      _sntprintf(errMsg, errMsgLen,errMsgBase, fieldName, defaultMaxAsChar);

					  _CLTHROWT_DEL(CL_ERR_Runtime,errMsg);
                    }
                } // while token->next

              } _CLFINALLY (
                stream->close();
                _CLDELETE(stream);
              );
            } _CLFINALLY (
              if (delReader) {
                _CLDELETE(reader);
              }
            );
          } // if/else field is to be tokenized
          fieldLengths[fieldNumber] = length; // save field length
          fieldPositions[fieldNumber] = position;	  // save field position
          fieldBoosts[fieldNumber] *= field->getBoost();

        } // if field is to beindexed
      } // while more fields available
    } _CLFINALLY (
      _CLDELETE(fields);
    );
  } // Document:;invertDocument
 
 
	void DocumentWriter::addPosition(const TCHAR* field,
                                         const TCHAR* text,
                                         const int32_t position) {
        
		termBuffer->set(field,text);

		Posting* ti = postingTable.get(termBuffer);
		if (ti != NULL) {				  // word seen before
			int32_t freq = ti->freq;
			if (ti->positionsLength == freq) {	  // positions array is full
				ti->positionsLength*=2;
				ti->positions = (int32_t*)realloc(ti->positions, sizeof(int32_t) * ti->positionsLength);
			}
			ti->positions[freq] = position;		  // add new position
			ti->freq++;			  // update frequency
		} else {					  // word not seen before
			Term* term = _CLNEW Term( field, text);
			postingTable.put(term, _CLNEW Posting(term, position));
		}
	}

	//static
	/**
	 * Sorts postings[lo..hi] (both bounds inclusive) in place, ascending by
	 * Term::compareTo of each posting's term.
	 *
	 * The entries at lo, the midpoint and hi are put in order first. The
	 * midpoint's term is then used as the partition value for the entries
	 * strictly between lo and hi, and both resulting ranges are sorted
	 * recursively.
	 */
	void DocumentWriter::quickSort(Posting**& postings, const int32_t lo, const int32_t hi) {
		if (hi <= lo)
			return;

		const int32_t mid = (lo + hi) / 2;
		Posting* held = NULL;	// scratch slot shared by every swap below

		// Order the three sampled entries: lo <= mid <= hi.
		if (postings[lo]->term->compareTo(postings[mid]->term) > 0) {
			held = postings[lo];
			postings[lo] = postings[mid];
			postings[mid] = held;
		}
		if (postings[mid]->term->compareTo(postings[hi]->term) > 0) {
			held = postings[mid];
			postings[mid] = postings[hi];
			postings[hi] = held;

			// The entry that just moved down from hi may also belong before lo.
			if (postings[lo]->term->compareTo(postings[mid]->term) > 0) {
				held = postings[lo];
				postings[lo] = postings[mid];
				postings[mid] = held;
			}
		}

		int32_t up = lo + 1;
		int32_t down = hi - 1;

		// At most three entries: the ordering above already sorted them.
		if (down <= up)
			return;

		// Borrowed pointer; it is only compared against and never released here.
		const Term* pivot = postings[mid]->term;

		while (true) {
			// Walk down past entries greater than the pivot.
			while (postings[down]->term->compareTo(pivot) > 0)
				--down;

			// Walk up past entries not greater than the pivot.
			while (up < down && postings[up]->term->compareTo(pivot) <= 0)
				++up;

			if (up >= down)
				break;

			held = postings[up];
			postings[up] = postings[down];
			postings[down] = held;
			--down;
		}

		quickSort(postings, lo, up);
		quickSort(postings, up + 1, hi);
	}

    /**
     * Writes the given postings to the segment's ".frq" and ".prx" outputs,
     * adds one dictionary entry per posting through a TermInfosWriter, and
     * writes term vectors for fields whose FieldInfo has storeTermVector set.
     *
     * Postings are processed in array order. A field change is detected by
     * comparing each posting's field name with the previous one, so the array
     * is presumably expected to be grouped by field (verify against caller).
     *
     * @param postings        postings to write
     * @param postingsLength  number of entries in postings
     * @param segment         segment name used to build the output file names
     * @throws CLuceneError   if closing an output fails, the first such
     *                        failure is thrown via _CLTHROWA(ierr, err)
     */
    void DocumentWriter::writePostings(Posting** postings, const int32_t postingsLength, const char* segment){
		// Closes and deletes obj if it is non-NULL. A failure is recorded in
		// err/ierr only when nothing has been recorded yet, so the first
		// failure is the one that gets reported.
		#define __DOCLOSE(obj) if(obj!=NULL){ try{ obj->close(); _CLDELETE(obj);} catch(CLuceneError &e){ if(err==NULL){ierr=e.number();err=e.what();} } catch(...){ if(err==NULL){err="Unknown error while closing posting tables";} } }
		IndexOutput* freq = NULL;
		IndexOutput* prox = NULL;
		TermInfosWriter* tis = NULL;
		TermVectorsWriter* termVectorWriter = NULL;
		TermInfo* ti = NULL;	// declared out here so the cleanup block can release it on error
		try {
			//open files for inverse index storage
			const char* buf = Misc::segmentname( segment, ".frq");
			freq = directory->createOutput( buf );
			_CLDELETE_CaARRAY( buf );

			buf = Misc::segmentname( segment, ".prx");
			prox = directory->createOutput( buf );
			_CLDELETE_CaARRAY( buf );

			tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos);
			ti = _CLNEW TermInfo();
			const TCHAR* currentField = NULL;
			for (int32_t i = 0; i < postingsLength; i++) {
				const Posting* posting = postings[i];

				// add an entry to the dictionary with pointers to prox and freq files
				ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1);
				tis->add(posting->term, ti);

				// add an entry to the freq file
				int32_t postingFreq = posting->freq;
				if (postingFreq == 1)				  // optimize freq=1
					freq->writeVInt(1);			  // set low bit of doc num.
				else {
					freq->writeVInt(0);			  // the document number
					freq->writeVInt(postingFreq);			  // frequency in doc
				}

				int32_t lastPosition = 0;			  // write positions
				int32_t* positions = posting->positions;
				for (int32_t j = 0; j < postingFreq; j++) {		  // use delta-encoding
					prox->writeVInt(positions[j] - lastPosition);
					lastPosition = positions[j];
				}

				// check to see if we switched to a new field
				const TCHAR* termField = posting->term->field();
				if ( currentField == NULL || _tcscmp(currentField,termField) != 0 ) { //todo, can we do an intern'd check?
					// changing field - see if there is something to save
					currentField = termField;
					FieldInfo* fi = fieldInfos->fieldInfo(currentField);
					if (fi->storeTermVector) {
						// the term vector writer is created on first use only
						if (termVectorWriter == NULL) {
							termVectorWriter =
								_CLNEW TermVectorsWriter(directory, segment, fieldInfos);
							termVectorWriter->openDocument();
						}
						termVectorWriter->openField(currentField);
					} else if (termVectorWriter != NULL) {
						termVectorWriter->closeField();
					}
				}
				if (termVectorWriter != NULL && termVectorWriter->isFieldOpen()) {
					termVectorWriter->addTerm(posting->term->text(), postingFreq);
				}
			}
			if (termVectorWriter != NULL)
				termVectorWriter->closeDocument();
        }_CLFINALLY ( 
			const char* err=NULL;
			int32_t ierr=0;

			// Released here rather than at the end of the try body so that an
			// exception thrown while writing does not leak it.
			if (ti != NULL) {
				_CLDELETE(ti);
			}

			// make an effort to close all streams we can but remember and re-throw
			// the first exception encountered in this process
			// NOTE(review): err keeps the pointer returned by e.what() after the
			// handler has exited - confirm that string outlives the exception.
			__DOCLOSE(freq);
			__DOCLOSE(prox);
			__DOCLOSE(tis);
			__DOCLOSE(termVectorWriter);
			if ( err != NULL )
				_CLTHROWA(ierr,err);
        );
	}

	void DocumentWriter::writeNorms(const Document* doc, const char* segment) {
      char fn[CL_MAX_PATH];
      for(int32_t n = 0; n < fieldInfos->size(); n++){
         FieldInfo* fi = fieldInfos->fieldInfo(n);
         if(fi->isIndexed){
            float_t norm = fieldBoosts[n] * similarity->lengthNorm(fi->name, fieldLengths[n]);

            _snprintf(fn,CL_MAX_PATH,"%s.f%d",segment,n);
            IndexOutput* norms = directory->createOutput(fn);
            try {
               norms->writeByte(CL_NS(search)::Similarity::encodeNorm(norm));
				}_CLFINALLY ( 
				    norms->close();
				    _CLDELETE(norms);
            )
         }
      }
   }
CL_NS_END
上一页 12
💿 文件大小 2052 K
👤 上传用户 managerliu123
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#clucene #lucene #stl #检索
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -