📄 documentwriter.cpp
字号:
length++;
// Apply field truncation policy.
if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) {
// The client programmer has explicitly authorized us to
// truncate the token stream after maxFieldLength tokens.
if ( length > maxFieldLength) {
break;
}
} else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) {
const TCHAR* errMsgBase =
_T("Indexing a huge number of tokens from a single")
_T(" field (\"%s\", in this case) can cause CLucene")
_T(" to use memory excessively.")
_T(" By default, CLucene will accept only %s tokens")
_T(" tokens from a single field before forcing the")
_T(" client programmer to specify a threshold at")
_T(" which to truncate the token stream.")
_T(" You should set this threshold via")
_T(" IndexReader::maxFieldLength (set to LUCENE_INT32_MAX")
_T(" to disable truncation, or a value to specify maximum number of fields).");
TCHAR defaultMaxAsChar[34];
_i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH,
defaultMaxAsChar, 10
);
int32_t errMsgLen = _tcslen(errMsgBase)
+ _tcslen(fieldName)
+ _tcslen(defaultMaxAsChar);
TCHAR* errMsg = _CL_NEWARRAY(TCHAR,errMsgLen+1);
_sntprintf(errMsg, errMsgLen,errMsgBase, fieldName, defaultMaxAsChar);
_CLTHROWT_DEL(CL_ERR_Runtime,errMsg);
}
} // while token->next
} _CLFINALLY (
stream->close();
_CLDELETE(stream);
);
} _CLFINALLY (
if (delReader) {
_CLDELETE(reader);
}
);
} // if/else field is to be tokenized
fieldLengths[fieldNumber] = length; // save field length
fieldPositions[fieldNumber] = position; // save field position
fieldBoosts[fieldNumber] *= field->getBoost();
} // if field is to beindexed
} // while more fields available
} _CLFINALLY (
_CLDELETE(fields);
);
} // Document:;invertDocument
void DocumentWriter::addPosition(const TCHAR* field,
const TCHAR* text,
const int32_t position) {
termBuffer->set(field,text);
Posting* ti = postingTable.get(termBuffer);
if (ti != NULL) { // word seen before
int32_t freq = ti->freq;
if (ti->positionsLength == freq) { // positions array is full
ti->positionsLength*=2;
ti->positions = (int32_t*)realloc(ti->positions, sizeof(int32_t) * ti->positionsLength);
}
ti->positions[freq] = position; // add new position
ti->freq++; // update frequency
} else { // word not seen before
Term* term = _CLNEW Term( field, text);
postingTable.put(term, _CLNEW Posting(term, position));
}
}
/**
 * Sorts postings[lo..hi] (both bounds inclusive) in place so that the terms
 * of the postings are in ascending order according to Term::compareTo.
 *
 * The first, middle and last elements are ordered among themselves first and
 * the middle one is then used as the partitioning term; the two halves are
 * sorted recursively.
 *
 * @param postings array of Posting pointers; only the pointers are moved
 * @param lo       index of the first element of the range to sort
 * @param hi       index of the last element of the range to sort
 */
//static
void DocumentWriter::quickSort(Posting**& postings, const int32_t lo, const int32_t hi) {
    if(lo >= hi)
        return;

    // lo + (hi - lo) / 2 instead of (lo + hi) / 2: for 0 <= lo < hi the sum
    // of two large indexes can overflow int32_t, the difference cannot.
    const int32_t mid = lo + (hi - lo) / 2;

    // Put postings[lo], postings[mid] and postings[hi] in order.
    if(postings[lo]->term->compareTo(postings[mid]->term) > 0) {
        Posting* tmp = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp;
    }

    if(postings[mid]->term->compareTo(postings[hi]->term) > 0) {
        Posting* tmp = postings[mid];
        postings[mid] = postings[hi];
        postings[hi] = tmp;

        // The element that just moved into the middle may be smaller than
        // the first one.
        if(postings[lo]->term->compareTo(postings[mid]->term) > 0) {
            Posting* tmp2 = postings[lo];
            postings[lo] = postings[mid];
            postings[mid] = tmp2;
        }
    }

    int32_t left = lo + 1;
    int32_t right = hi - 1;

    // Ranges of up to three elements are fully ordered by the steps above.
    if (left >= right)
        return;

    // The term object itself is remembered, not its slot, so swapping array
    // entries below does not change what is compared against.
    const Term* partition = postings[mid]->term; //not kept, so no need to finalize

    for( ;; ) {
        while(postings[right]->term->compareTo(partition) > 0)
            --right;

        while(left < right && postings[left]->term->compareTo(partition) <= 0)
            ++left;

        if(left < right) {
            Posting* tmp = postings[left];
            postings[left] = postings[right];
            postings[right] = tmp;
            --right;
        } else {
            break;
        }
    }

    quickSort(postings, lo, left);
    quickSort(postings, left + 1, hi);
}
void DocumentWriter::writePostings(Posting** postings, const int32_t postingsLength, const char* segment){
#define __DOCLOSE(obj) if(obj!=NULL){ try{ obj->close(); _CLDELETE(obj);} catch(CLuceneError &e){ierr=e.number();err=e.what();} catch(...){err="Unknown error while closing posting tables";} }
IndexOutput* freq = NULL;
IndexOutput* prox = NULL;
TermInfosWriter* tis = NULL;
TermVectorsWriter* termVectorWriter = NULL;
try {
//open files for inverse index storage
const char* buf = Misc::segmentname( segment, ".frq");
freq = directory->createOutput( buf );
_CLDELETE_CaARRAY( buf );
buf = Misc::segmentname( segment, ".prx");
prox = directory->createOutput( buf );
_CLDELETE_CaARRAY( buf );
tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos);
TermInfo* ti = _CLNEW TermInfo();
const TCHAR* currentField = NULL;
for (int32_t i = 0; i < postingsLength; i++) {
const Posting* posting = postings[i];
// add an entry to the dictionary with pointers to prox and freq files
ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1);
tis->add(posting->term, ti);
// add an entry to the freq file
int32_t postingFreq = posting->freq;
if (postingFreq == 1) // optimize freq=1
freq->writeVInt(1); // set low bit of doc num.
else {
freq->writeVInt(0); // the document number
freq->writeVInt(postingFreq); // frequency in doc
}
int32_t lastPosition = 0; // write positions
int32_t* positions = posting->positions;
for (int32_t j = 0; j < postingFreq; j++) { // use delta-encoding
prox->writeVInt(positions[j] - lastPosition);
lastPosition = positions[j];
}
// check to see if we switched to a new field
const TCHAR* termField = posting->term->field();
if ( currentField == NULL || _tcscmp(currentField,termField) != 0 ) { //todo, can we do an intern'd check?
// changing field - see if there is something to save
currentField = termField;
FieldInfo* fi = fieldInfos->fieldInfo(currentField);
if (fi->storeTermVector) {
if (termVectorWriter == NULL) {
termVectorWriter =
_CLNEW TermVectorsWriter(directory, segment, fieldInfos);
termVectorWriter->openDocument();
}
termVectorWriter->openField(currentField);
} else if (termVectorWriter != NULL) {
termVectorWriter->closeField();
}
}
if (termVectorWriter != NULL && termVectorWriter->isFieldOpen()) {
termVectorWriter->addTerm(posting->term->text(), postingFreq);
}
}
if (termVectorWriter != NULL)
termVectorWriter->closeDocument();
_CLDELETE(ti);
}_CLFINALLY (
const char* err=NULL;
int32_t ierr=0;
// make an effort to close all streams we can but remember and re-throw
// the first exception encountered in this process
__DOCLOSE(freq);
__DOCLOSE(prox);
__DOCLOSE(tis);
__DOCLOSE(termVectorWriter);
if ( err != NULL )
_CLTHROWA(ierr,err);
);
}
/**
 * Writes the norm of every indexed field to its own file of the segment.
 *
 * For each field n in fieldInfos that is indexed, the norm is computed as
 * fieldBoosts[n] * similarity->lengthNorm(field name, fieldLengths[n]),
 * encoded with Similarity::encodeNorm and written as a single byte to the
 * output named "<segment>.f<n>". Each output is closed and freed again
 * before the next field is handled.
 *
 * @param doc     not used by this function
 * @param segment name of the segment; used as prefix of the file names
 */
void DocumentWriter::writeNorms(const Document* doc, const char* segment) {
    char fn[CL_MAX_PATH];

    for(int32_t n = 0; n < fieldInfos->size(); n++){
        FieldInfo* fi = fieldInfos->fieldInfo(n);
        if(!fi->isIndexed)
            continue;

        float_t norm = fieldBoosts[n] * similarity->lengthNorm(fi->name, fieldLengths[n]);

        _snprintf(fn,CL_MAX_PATH,"%s.f%d",segment,n);
        // _snprintf does not write a terminator on every platform when the
        // formatted name fills the whole buffer; make sure createOutput
        // never receives an unterminated string.
        fn[CL_MAX_PATH - 1] = 0;

        IndexOutput* norms = directory->createOutput(fn);
        try {
            norms->writeByte(CL_NS(search)::Similarity::encodeNorm(norm));
        }_CLFINALLY (
            norms->close();
            _CLDELETE(norms);
        )
    }
}
CL_NS_END
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -