📄 segmentmerger.cpp
字号:
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "SegmentMerger.h"
CL_NS_USE(util)
CL_NS_USE(document)
CL_NS_USE(store)
CL_NS_DEF(index)
// File extensions of old-style index files.
// The extensions are packed back-to-back into a single string literal as
// fixed-width records: three characters followed by a '\0', i.e. 4 bytes per
// entry. createCompoundFile() reads entry i as COMPOUND_EXTENSIONS + i*4, so
// every extension listed here must stay exactly three characters long.
const char* COMPOUND_EXTENSIONS="fnm\0" "frq\0" "prx\0" "fdx\0" "fdt\0" "tii\0" "tis\0";
// Number of 4-byte records packed into COMPOUND_EXTENSIONS.
int COMPOUND_EXTENSIONS_LENGTH=7;
// Term vector file extensions, packed with the same 4-bytes-per-record layout
// and read as VECTOR_EXTENSIONS + i*4.
const char* VECTOR_EXTENSIONS="tvx\0" "tvd\0" "tvf\0";
// Number of 4-byte records packed into VECTOR_EXTENSIONS.
int VECTOR_EXTENSIONS_LENGTH=3;
SegmentMerger::SegmentMerger(Directory* dir, const char* name, const bool compoundFile): directory(dir){
    //Func - Constructor
    //Pre  - dir holds a valid reference to a Directory
    //       name != NULL
    //Post - The instance has been created: the segment name has been
    //       duplicated, all output members are NULL, the skip bookkeeping
    //       is zeroed and a RAMIndexOutput skip buffer has been allocated

    CND_PRECONDITION(name != NULL, "name is NULL");

    //Duplicate the segment name; the copy is released in the destructor
    segment         = STRDUP_AtoA(name);
    useCompoundFile = compoundFile;

    //No outputs, writers, queue or field infos exist yet
    fieldInfos      = NULL;
    freqOutput      = NULL;
    proxOutput      = NULL;
    termInfosWriter = NULL;
    queue           = NULL;

    //Skip bookkeeping starts out zeroed
    skipInterval        = 0;
    lastSkipDoc         = 0;
    lastSkipFreqPointer = 0;
    lastSkipProxPointer = 0;

    //Buffer used for skip data
    skipBuffer = _CLNEW CL_NS(store)::RAMIndexOutput();
}
SegmentMerger::~SegmentMerger(){
    //Func - Destructor
    //Pre  - true
    //Post - The instance has been destroyed: every output, writer, queue
    //       and buffer that was still set has been closed and deleted,
    //       and the duplicated segment name has been released

    //Empty the set of readers
    readers.clear();

    //Release the merged field infos
    _CLDELETE(fieldInfos);

    //IndexOutput of the frequency file
    if (freqOutput){
        freqOutput->close();
        _CLDELETE(freqOutput);
    }

    //IndexOutput of the prox file
    if (proxOutput){
        proxOutput->close();
        _CLDELETE(proxOutput);
    }

    //Writer of the term infos
    if (termInfosWriter){
        termInfosWriter->close();
        _CLDELETE(termInfosWriter);
    }

    //Merge queue
    if (queue){
        queue->close();
        _CLDELETE(queue);
    }

    //Close and destroy the skipBuffer
    if (skipBuffer){
        skipBuffer->close();
        _CLDELETE(skipBuffer);
    }

    //Segment name duplicated in the constructor
    _CLDELETE_CaARRAY(segment);
}
void SegmentMerger::add(SegmentReader* reader) {
    //Func - Registers a SegmentReader with this merger
    //Pre  - reader contains a valid reference to a SegmentReader
    //Post - reader has been appended to the end of the set of readers

    readers.push_back(reader);
}
IndexReader* SegmentMerger::segmentReader(const int32_t i) {
    //Func - Looks up the reader stored at position i
    //Pre  - 0 <= i < readers.size()
    //Post - The i-th entry of the set of readers has been returned

    CND_PRECONDITION(i >= 0, "i is a negative number");
    CND_PRECONDITION((size_t)i < readers.size(), "i is bigger than the number of SegmentReader instances");

    //Fetch the entry and make sure it is actually set
    SegmentReader* found = readers[i];
    CND_CONDITION(found != NULL,"No SegmentReader found");

    return found;
}
int32_t SegmentMerger::merge() {
    //Func - Runs the merge steps in order: fields, terms, norms, then
    //       term vectors (only if the merged field infos report vectors)
    //       and finally the compound file (only if it was requested in
    //       the constructor)
    //Pre  - true
    //Post - The value returned by mergeFields() has been returned

    const int32_t mergedDocs = mergeFields();

    mergeTerms();
    mergeNorms();

    if (fieldInfos->hasVectors()) {
        mergeVectors();
    }

    if (useCompoundFile) {
        createCompoundFile();
    }

    return mergedDocs;
}
void SegmentMerger::closeReaders(){
for (uint32_t i = 0; i < readers.size(); i++) { // close readers
IndexReader* reader = readers[i];
reader->close();
}
}
void SegmentMerger::createCompoundFile(){
char name[CL_MAX_PATH];
_snprintf(name,CL_MAX_PATH,"%s.cfs",segment);
CompoundFileWriter* cfsWriter = _CLNEW CompoundFileWriter(directory, name);
char** files = _CL_NEWARRAY(char*, COMPOUND_EXTENSIONS_LENGTH + VECTOR_EXTENSIONS_LENGTH + fieldInfos->size());
int32_t fileslen = 0;
{ //msvc6 scope fix
// Basic files
for (int32_t i = 0; i < COMPOUND_EXTENSIONS_LENGTH; i++) {
files[fileslen]=Misc::ajoin(segment,".",COMPOUND_EXTENSIONS+(i*4));
fileslen++;
}
}
{ //msvc6 scope fix
// Field norm files
for (int32_t i = 0; i < fieldInfos->size(); i++) {
FieldInfo* fi = fieldInfos->fieldInfo(i);
if (fi->isIndexed) {
TCHAR tbuf[10];
char abuf[10];
_i64tot(i,tbuf,10);
STRCPY_TtoA(abuf,tbuf,10);
files[fileslen] = Misc::ajoin(segment,".f",abuf);
fileslen++;
}
}
}
// Vector files
if (fieldInfos->hasVectors()) {
for (int32_t i = 0; i < VECTOR_EXTENSIONS_LENGTH; i++) {
files[fileslen] = Misc::ajoin(segment, ".", VECTOR_EXTENSIONS+(i*4));
fileslen++;
}
}
{ //msvc6 scope fix
// Now merge all added files
for ( int32_t i=0;i<fileslen;i++ ){
cfsWriter->addFile(files[i]);
}
}
// Perform the merge
cfsWriter->close();
_CLDELETE(cfsWriter);
{ //msvc6 scope fix
// Now delete the source files
for ( int32_t i=0;i<fileslen;i++ ){
directory->deleteFile(files[i]);
_CLDELETE_LCaARRAY(files[i]);
}
}
_CLDELETE_ARRAY(files);
}
int32_t SegmentMerger::mergeFields() {
    //Func - Merge the fields of all segments
    //Pre  - true
    //Post - A new FieldInfos holding the field names of all readers has
    //       been written to "<segment>.fnm", the documents that are not
    //       marked deleted have been copied to a new FieldsWriter, and the
    //       number of copied documents has been returned

    //Start from an empty FieldInfos and collect the field names
    fieldInfos = _CLNEW FieldInfos();
    CND_CONDITION(fieldInfos != NULL,"Memory allocation for fieldInfos failed");

    for (uint32_t r = 0; r < readers.size(); ++r){
        SegmentReader* segReader = readers[r];
        CND_CONDITION(segReader != NULL,"No SegmentReader found");

        //Three passes over the names of this reader; each returned array
        //is released right after it has been added.
        //NOTE(review): the boolean arguments presumably select/flag
        //"indexed" and "stores term vector" - confirm against FieldInfos
        TCHAR** names = segReader->getIndexedFieldNames(true);
        fieldInfos->add((const TCHAR**)names, true, true);
        _CLDELETE_CARRAY_ALL(names);

        names = segReader->getIndexedFieldNames(false);
        fieldInfos->add((const TCHAR**)names, true, false);
        _CLDELETE_CARRAY_ALL(names);

        names = segReader->getFieldNames(false);
        fieldInfos->add((const TCHAR**)names, false, false);
        _CLDELETE_CARRAY_ALL(names);
    }

    //Write the merged FieldInfos to the ".fnm" file of the segment
    const char* fnmName = Misc::segmentname(segment,".fnm");
    fieldInfos->write(directory, fnmName );
    _CLDELETE_CaARRAY(fnmName);

    //Merge the field values using the merged fieldInfos
    FieldsWriter* fieldsWriter = _CLNEW FieldsWriter(directory, segment, fieldInfos);
    CND_CONDITION(fieldsWriter != NULL,"Memory allocation for fieldsWriter failed");

    int32_t docCount = 0;
    try {
        for (uint32_t s = 0; s < readers.size(); ++s) {
            IndexReader* source = (SegmentReader*)readers[s];
            CND_CONDITION(source != NULL, "No SegmentReader found");

            //maxDoc() also counts documents that are marked deleted
            const int32_t docLimit = source->maxDoc();
            for (int32_t docNum = 0; docNum < docLimit; ++docNum){
                //Deleted documents are not carried over
                if (source->isDeleted(docNum))
                    continue;

                //Copy the document, count it and release it again
                Document* doc = source->document(docNum);
                fieldsWriter->addDocument( doc );
                ++docCount;
                _CLDELETE(doc);
            }
        }
    }_CLFINALLY(
        fieldsWriter->close();
        _CLDELETE( fieldsWriter );
    );

    return docCount;
}
void SegmentMerger::mergeVectors(){
TermVectorsWriter* termVectorsWriter =
_CLNEW TermVectorsWriter(directory, segment, fieldInfos);
try {
for (uint32_t r = 0; r < readers.size(); r++) {
IndexReader* reader = readers[r];
int32_t maxDoc = reader->maxDoc();
for (int32_t docNum = 0; docNum < maxDoc; docNum++) {
// skip deleted docs
if (reader->isDeleted(docNum)) {
continue;
}
termVectorsWriter->openDocument();
// get all term vectors
TermFreqVector** sourceTermVector =
reader->getTermFreqVectors(docNum);
if (sourceTermVector != NULL) {
int32_t f = 0;
TermFreqVector* termVector=NULL;
while ( (termVector=sourceTermVector[f++]) != NULL ){
termVectorsWriter->openField(termVector->getField());
const TCHAR** terms = termVector->getTerms();
const int32_t* freqs = termVector->getTermFrequencies();
int32_t t = 0;
while ( terms[t] != NULL ){
termVectorsWriter->addTerm(terms[t], freqs[t]);
//todo: delete terms string return
t++;
}
_CLDELETE(termVector);
}
_CLDELETE_ARRAY(sourceTermVector);
}
termVectorsWriter->closeDocument();
}
}
}_CLFINALLY( _CLDELETE(termVectorsWriter); );
}
void SegmentMerger::mergeTerms() {
//Func - Merge the terms of all segments
//Pre - fieldInfos != NULL
//Post - The terms of all segments have been merged
CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");
try{
//create a filename for the new Frequency File for segment
const char* buf = Misc::segmentname(segment,".frq");
//Open an IndexOutput to the new Frequency File
freqOutput = directory->createOutput( buf );
//Destroy the buffer of the filename
_CLDELETE_CaARRAY(buf);
//create a filename for the new Prox File for segment
buf = Misc::segmentname(segment,".prx");
//Open an IndexOutput to the new Prox File
proxOutput = directory->createOutput( buf );
//delete buffer
_CLDELETE_CaARRAY( buf );
//Instantiate a new termInfosWriter which will write in directory
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -