📄 documentwriter.cpp
字号:
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "DocumentWriter.h"
#include "FieldInfos.h"
#include "IndexWriter.h"
#include "FieldsWriter.h"
#include "Term.h"
#include "TermInfo.h"
#include "TermInfosWriter.h"
#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/search/Similarity.h"
#include "TermInfosWriter.h"
#include "FieldsWriter.h"
CL_NS_USE(util)
CL_NS_USE(store)
CL_NS_USE(analysis)
CL_NS_USE(document)
CL_NS_DEF(index)
/*Posting*/
int32_t Posting::getPositionsLength() const{
return positionsLength;
}
Posting::Posting(Term* t, const int32_t position)
{
//Func - Constructor: builds the posting for the first occurrence of a term
//Pre  - t contains a valid reference to a Term
//Post - Instance has been created with freq == 1 and a one-element
//       positions array whose only entry is position

    // The buffer is obtained with malloc so that it pairs with the free()
    // performed in the destructor.
    int32_t* firstSlot = static_cast<int32_t*>(malloc(sizeof(int32_t)));
    firstSlot[0] = position;

    positions       = firstSlot;
    positionsLength = 1;

    // NOTE(review): _CL_POINTER presumably takes a counted reference on t that
    // the destructor's _CLDECDELETE gives back - confirm against the macro.
    term = _CL_POINTER(t);
    freq = 1;
}
Posting::~Posting()
{
//Func - Destructor
//Pre  - true
//Post - The instance has been destroyed: the term has been handed to
//       _CLDECDELETE and the positions buffer has been freed

    // Give back the term acquired through _CL_POINTER in the constructor.
    _CLDECDELETE(term);

    // positions was allocated with malloc, so it must be released with free.
    free(positions);
}
DocumentWriter::DocumentWriter(Directory* d, Analyzer* a, CL_NS(search)::Similarity* sim, const int32_t mfl):
    analyzer(a),
    directory(d),
    maxFieldLength(mfl),
    fieldInfos(NULL),
    fieldLengths(NULL),
    similarity(sim),
    fieldPositions(NULL),
    fieldBoosts(NULL),
    termBuffer(_CLNEW Term( LUCENE_BLANK_STRING, LUCENE_BLANK_STRING ))
{
//Func - Constructor
//Pre  - d contains a valid reference to a Directory
//       a contains a valid reference to an Analyzer
//       mfl > 0 or mfl == IndexWriter::FIELD_TRUNC_POLICY__WARN;
//       mfl contains the maximum field length
//Post - Instance has been created. All per-document members (fieldInfos,
//       fieldLengths, fieldPositions, fieldBoosts) are NULL and termBuffer
//       refers to a newly created blank Term

    CND_PRECONDITION(((mfl > 0) || (mfl == IndexWriter::FIELD_TRUNC_POLICY__WARN)),
        "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN")

    // Nothing else to do: the initializer list has already set every
    // per-document pointer to NULL.
}
DocumentWriter::~DocumentWriter()
{
//Func - Destructor
//Pre  - true
//Post - The instance has been destroyed: the posting table has been
//       cleared and all members owned by this writer have been released

    // Entries of postingTable first.
    clearPostingTable();

    // Per-field arrays allocated in addDocument.
    _CLDELETE_ARRAY(fieldBoosts);
    _CLDELETE_ARRAY(fieldPositions);
    _CLDELETE_ARRAY(fieldLengths);

    // Field name table created in addDocument.
    _CLDELETE(fieldInfos);

    // Scratch term created in the constructor.
    _CLDECDELETE(termBuffer);
}
void DocumentWriter::clearPostingTable(){
CL_NS(util)::CLHashtable<Term*,Posting*,Term::Compare,Term::Equals>::iterator itr = postingTable.begin();
while ( itr != postingTable.end() ){
_CLDELETE(itr->second);
_CLLDECDELETE(itr->first);
++itr;
}
postingTable.clear();
}
void DocumentWriter::addDocument(const char* segment, Document* doc) {
// write field names
fieldInfos = _CLNEW FieldInfos();
fieldInfos->add(doc);
const char* buf = Misc::segmentname(segment, ".fnm");
fieldInfos->write(directory, buf);
_CLDELETE_CaARRAY(buf);
// write field values
FieldsWriter fieldsWriter(directory, segment, fieldInfos);
try {
fieldsWriter.addDocument(doc);
} _CLFINALLY( fieldsWriter.close() );
// invert doc into postingTable
clearPostingTable(); // clear postingTable
fieldLengths = _CL_NEWARRAY(int32_t,fieldInfos->size()); // init fieldLengths
fieldPositions = _CL_NEWARRAY(int32_t,fieldInfos->size()); // init fieldPositions
//initialise fieldBoost array with default boost
int32_t fbl = fieldInfos->size();
float_t fbd = doc->getBoost();
fieldBoosts = _CL_NEWARRAY(float_t,fbl); // init fieldBoosts
{ //msvc6 scope fix
for ( int32_t i=0;i<fbl;i++ )
fieldBoosts[i] = fbd;
}
{ //msvc6 scope fix
for ( int32_t i=0;i<fieldInfos->size();i++ )
fieldLengths[i] = 0;
} //msvc6 scope fix
invertDocument(doc);
// sort postingTable into an array
Posting** postings = NULL;
int32_t postingsLength = 0;
sortPostingTable(postings,postingsLength);
//DEBUG:
/*for (int32_t i = 0; i < postingsLength; i++) {
Posting* posting = postings[i];
TCHAR* b = posting->term->toString();
_cout << b << " freq=" << posting->freq;
_CLDELETE(b);
_cout << " pos=" << posting->positions[0];
for (int32_t j = 1; j < posting->freq; j++)
_cout <<"," << posting->positions[j];
_cout << endl;
}*/
// write postings
writePostings(postings,postingsLength, segment);
// write norms of indexed fields
writeNorms(doc, segment);
_CLDELETE_ARRAY( postings );
}
void DocumentWriter::sortPostingTable(Posting**& Array, int32_t& arraySize)
{
//Func - Copies the Posting pointers stored in postingTable into a newly
//       allocated array and sorts that array with quickSort
//Pre  - true
//Post - arraySize holds postingTable.size() and Array refers to a
//       _CL_NEWARRAY allocation of that many Posting pointers, sorted.
//       Only the pointers are copied; the Postings remain in postingTable.

    typedef CL_NS(util)::CLHashtable<Term*,Posting*,Term::Compare,Term::Equals>::iterator PostingTableIterator;

    // copy postingTable into an array
    arraySize = postingTable.size();
    Array = _CL_NEWARRAY(Posting*,arraySize);

    int32_t filled = 0;
    for (PostingTableIterator entry = postingTable.begin(); entry != postingTable.end(); ++entry) {
        Array[filled++] = entry->second;
    }

    // sort the array over the inclusive index range [0, filled - 1]
    quickSort(Array, 0, filled - 1);
}
void DocumentWriter::invertDocument(const Document* doc) {
DocumentFieldEnumeration* fields = doc->fields();
try {
while (fields->hasMoreElements()) {
Field* field = (Field*)fields->nextElement();
const TCHAR* fieldName = field->name();
const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName);
int32_t length = fieldLengths[fieldNumber]; // length of field
int32_t position = fieldLengths[fieldNumber]; // position in field
if (field->isIndexed()) {
if (!field->isTokenized()) { // un-tokenized field
//FEATURE: this is bug in java: if using a Reader, then
//field value will not be added. With CLucene, an untokenized
//field with a reader will still be added
if (field->stringValue() == NULL) {
CL_NS(util)::Reader* r = field->readerValue();
// this call tries to read the entire stream
// this may invalidate the string for the further calls
// it may be better to do this via a FilterReader
// TODO make a better implementation of this
const TCHAR* charBuf;
int64_t dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE);
if (dataLen == -1)
dataLen = 0;
//todo: would be better to pass the string length, in case
//a null char is passed, but then would need to test the output too.
addPosition(fieldName, charBuf, position++);
} else {
addPosition(fieldName, field->stringValue(), position++);
}
length++;
} else { // field must be tokenized
CL_NS(util)::Reader* reader; // find or make Reader
bool delReader = false;
if (field->readerValue() != NULL) {
reader = field->readerValue();
} else if (field->stringValue() != NULL) {
reader = _CLNEW CL_NS(util)::StringReader(field->stringValue(),_tcslen(field->stringValue()),false);
delReader = true;
} else {
_CLTHROWA(CL_ERR_IO,"field must have either String or Reader value");
}
try {
// Tokenize field and add to postingTable.
CL_NS(analysis)::TokenStream* stream = analyzer->tokenStream(fieldName, reader);
try {
CL_NS(analysis)::Token t;
while (stream->next(&t)) {
position += (t.getPositionIncrement() - 1);
addPosition(fieldName, t.termText(), position++);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -