⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 documentwriter.cpp

📁 clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"

#include "DocumentWriter.h"
#include "FieldInfos.h"
#include "IndexWriter.h"
#include "FieldsWriter.h"
#include "Term.h"
#include "TermInfo.h"
#include "TermInfosWriter.h"

#include "CLucene/analysis/AnalysisHeader.h"

#include "CLucene/search/Similarity.h"
#include "TermInfosWriter.h"
#include "FieldsWriter.h"

CL_NS_USE(util)
CL_NS_USE(store)
CL_NS_USE(analysis)
CL_NS_USE(document)
CL_NS_DEF(index)

    /*Posting*/
	int32_t Posting::getPositionsLength() const{
	//Func - Returns the current capacity of the positions array.
		return this->positionsLength;
	}
	  
	Posting::Posting(Term* t, const int32_t position)
	{
    //Func - Constructor
    //Pre  - t contains a valid reference to a Term
    //Post - Instance records one occurrence of t, stored at the given position
		term = _CL_POINTER(t);
		freq = 1;

		// allocated with malloc (not _CL_NEWARRAY) because ~Posting
		// releases this buffer with free()
		positionsLength = 1;
		positions = (int32_t*)malloc(sizeof(int32_t));
		positions[0] = position;
	}
	Posting::~Posting(){
    //Func - Destructor
    //Pre  - true
    //Post - The instance has been destroyed
  	 
  	    free(positions);    // positions was allocated with malloc in the constructor
		_CLDECDELETE(term); // release this posting's reference to the shared Term
	}



	DocumentWriter::DocumentWriter(Directory* d, Analyzer* a, CL_NS(search)::Similarity* sim, const int32_t mfl):
			analyzer(a),
			directory(d),
			maxFieldLength(mfl),
			fieldInfos(NULL),
			fieldLengths(NULL),
			similarity(sim),
			fieldPositions(NULL),
			fieldBoosts(NULL),
			termBuffer(_CLNEW Term( LUCENE_BLANK_STRING, LUCENE_BLANK_STRING )){
    //Func - Constructor
    //Pre  - d contains a valid reference to a Directory
    //       a contains a valid reference to an Analyzer
    //       mfl > 0 and contains the maximum field length, or equals
    //       IndexWriter::FIELD_TRUNC_POLICY__WARN
    //Post - Instance has been created; the per-field arrays start as NULL and
    //       are allocated when a document is added
  	 
  CND_PRECONDITION(((mfl > 0) || (mfl == IndexWriter::FIELD_TRUNC_POLICY__WARN)),
     "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN")

		// fieldInfos and fieldLengths are already NULL-initialized in the
		// initializer list; the redundant body re-assignments were removed.
	}

	DocumentWriter::~DocumentWriter(){
    //Func - Destructor
    //Pre  - true
    //Post - The instance has been destroyed
		clearPostingTable();           // free Postings and Term references held by postingTable
		_CLDELETE( fieldInfos );
		_CLDELETE_ARRAY(fieldLengths );
		_CLDELETE_ARRAY(fieldPositions );
		_CLDELETE_ARRAY(fieldBoosts );

		_CLDECDELETE(termBuffer);      // drop the reference taken in the constructor
	}
	
	void DocumentWriter::clearPostingTable(){
		CL_NS(util)::CLHashtable<Term*,Posting*,Term::Compare,Term::Equals>::iterator itr = postingTable.begin();
		while ( itr != postingTable.end() ){
			_CLDELETE(itr->second);
			_CLLDECDELETE(itr->first);

			++itr;
		}
		postingTable.clear();
	}

	void DocumentWriter::addDocument(const char* segment, Document* doc) {
		// write field names
		fieldInfos = _CLNEW FieldInfos();
		fieldInfos->add(doc);
		
		const char* buf = Misc::segmentname(segment, ".fnm");
		fieldInfos->write(directory, buf);
		_CLDELETE_CaARRAY(buf);

		// write field values
		FieldsWriter fieldsWriter(directory, segment, fieldInfos);
		try {
			fieldsWriter.addDocument(doc);
		} _CLFINALLY( fieldsWriter.close() );
	      
		// invert doc into postingTable
		clearPostingTable();			  // clear postingTable
		fieldLengths = _CL_NEWARRAY(int32_t,fieldInfos->size());	  // init fieldLengths
		fieldPositions = _CL_NEWARRAY(int32_t,fieldInfos->size());  // init fieldPositions
	      
		//initialise fieldBoost array with default boost
		int32_t fbl = fieldInfos->size();
		float_t fbd = doc->getBoost();
		fieldBoosts = _CL_NEWARRAY(float_t,fbl);	  // init fieldBoosts
		{ //msvc6 scope fix
			for ( int32_t i=0;i<fbl;i++ )
				fieldBoosts[i] = fbd;
		}

		{ //msvc6 scope fix
			for ( int32_t i=0;i<fieldInfos->size();i++ )
				fieldLengths[i] = 0;
		} //msvc6 scope fix
		invertDocument(doc);

		// sort postingTable into an array
		Posting** postings = NULL;
		int32_t postingsLength = 0;
		sortPostingTable(postings,postingsLength);

		//DEBUG:
		/*for (int32_t i = 0; i < postingsLength; i++) {
			Posting* posting = postings[i];
			
			TCHAR* b = posting->term->toString();
			_cout << b << " freq=" << posting->freq;
			_CLDELETE(b);

			_cout << " pos=" << posting->positions[0];
			for (int32_t j = 1; j < posting->freq; j++)
				_cout <<"," << posting->positions[j];
			
			_cout << endl;
		}*/


		// write postings
		writePostings(postings,postingsLength, segment);

		// write norms of indexed fields
		writeNorms(doc, segment);
		_CLDELETE_ARRAY( postings );
	}

	void DocumentWriter::sortPostingTable(Posting**& Array, int32_t& arraySize) {
	//Func - Copies the Posting pointers held in postingTable into a newly
	//       allocated array (returned through Array/arraySize) and sorts it.
		arraySize = postingTable.size();
		Array = _CL_NEWARRAY(Posting*,arraySize);

		int32_t pos = 0;
		CL_NS(util)::CLHashtable<Term*,Posting*,Term::Compare,Term::Equals>::iterator itr = postingTable.begin();
		for ( ; itr != postingTable.end(); ++itr ){
			Array[pos++] = itr->second;
		}

		// sort the copied pointers in place
		quickSort(Array, 0, pos - 1);
	}


  void DocumentWriter::invertDocument(const Document* doc) {
    DocumentFieldEnumeration* fields = doc->fields();
    try {
       while (fields->hasMoreElements()) {
        Field* field = (Field*)fields->nextElement();
        const TCHAR* fieldName = field->name();
        const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName);
        int32_t length = fieldLengths[fieldNumber];     // length of field
        int32_t position = fieldLengths[fieldNumber]; // position in field
        if (field->isIndexed()) {
        if (!field->isTokenized()) { // un-tokenized field
            //FEATURE: this is bug in java: if using a Reader, then
            //field value will not be added. With CLucene, an untokenized
            //field with a reader will still be added
           if (field->stringValue() == NULL) {
              CL_NS(util)::Reader* r = field->readerValue();
                // this call tries to read the entire stream
                // this may invalidate the string for the further calls
                // it may be better to do this via a FilterReader
                // TODO make a better implementation of this
                const TCHAR* charBuf;
                int64_t dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE);
                if (dataLen == -1)
                  dataLen = 0;
				//todo: would be better to pass the string length, in case
				//a null char is passed, but then would need to test the output too.
                addPosition(fieldName, charBuf, position++);
            } else {
				addPosition(fieldName, field->stringValue(), position++);
            }
            length++;
        } else { // field must be tokenized
            CL_NS(util)::Reader* reader; // find or make Reader
            bool delReader = false;
            if (field->readerValue() != NULL) {
              reader = field->readerValue();
            } else if (field->stringValue() != NULL) {
              reader = _CLNEW CL_NS(util)::StringReader(field->stringValue(),_tcslen(field->stringValue()),false);
              delReader = true;
            } else {
              _CLTHROWA(CL_ERR_IO,"field must have either String or Reader value");
            }

            try {
              // Tokenize field and add to postingTable.
              CL_NS(analysis)::TokenStream* stream = analyzer->tokenStream(fieldName, reader);

              try {
                CL_NS(analysis)::Token t;
                while (stream->next(&t)) {
                    position += (t.getPositionIncrement() - 1);
                    addPosition(fieldName, t.termText(), position++);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -