📄 segmentmerger.cpp
字号:
//for the segment name segment using the new merged fieldInfos
termInfosWriter = _CLNEW TermInfosWriter(directory, segment, fieldInfos);
//Condition check to see if termInfosWriter points to a valid instance
CND_CONDITION(termInfosWriter != NULL,"Memory allocation for termInfosWriter failed") ;
skipInterval = termInfosWriter->skipInterval;
queue = _CLNEW SegmentMergeQueue(readers.size());
//And merge the Term Infos
mergeTermInfos();
}_CLFINALLY(
//Close and destroy the IndexOutput to the Frequency File
if (freqOutput != NULL) { freqOutput->close(); _CLDELETE(freqOutput); }
//Close and destroy the IndexOutput to the Prox File
if (proxOutput != NULL) { proxOutput->close(); _CLDELETE(proxOutput); }
//Close and destroy the termInfosWriter
if (termInfosWriter != NULL) { termInfosWriter->close(); _CLDELETE(termInfosWriter); }
//Close and destroy the queue
if (queue != NULL) { queue->close(); _CLDELETE(queue);}
);
}
void SegmentMerger::mergeTermInfos(){
//Func - Merges all TermInfos into a single segment
//Pre - true
//Post - All TermInfos have been merged into a single segment
//Condition check to see if queue points to a valid instance
CND_CONDITION(queue != NULL, "Memory allocation for queue failed") ;
//base is the id of the first document in a segment
int32_t base = 0;
IndexReader* reader = NULL;
SegmentMergeInfo* smi = NULL;
//iterate through all the readers
for (uint32_t i = 0; i < readers.size(); i++) {
//Get the i-th reader
reader = readers[i];
//Condition check to see if reader points to a valid instance
CND_CONDITION(reader != NULL, "No SegmentReader found");
//Get the term enumeration of the reader
TermEnum* termEnum = reader->terms();
//Instantiate a new SegmentMerginfo for the current reader and enumeration
smi = _CLNEW SegmentMergeInfo(base, termEnum, reader);
//Condition check to see if smi points to a valid instance
CND_CONDITION(smi != NULL, "Memory allocation for smi failed") ;
//Increase the base by the number of documents that have not been marked deleted
//so base will contain a new value for the first document of the next iteration
base += reader->numDocs();
//Get the next current term
if (smi->next()){
//Store the SegmentMergeInfo smi with the initialized SegmentTermEnum TermEnum
//into the queue
queue->put(smi);
}else{
//Apparently the end of the TermEnum of the SegmentTerm has been reached so
//close the SegmentMergeInfo smi
smi->close();
//And destroy the instance and set smi to NULL (It will be used later in this method)
_CLDELETE(smi);
}
}
//Instantiate an array of SegmentMergeInfo instances called match
SegmentMergeInfo** match = _CL_NEWARRAY(SegmentMergeInfo*,readers.size()+1);
//Condition check to see if match points to a valid instance
CND_CONDITION(match != NULL, "Memory allocation for match failed") ;
SegmentMergeInfo* top = NULL;
//As long as there are SegmentMergeInfo instances stored in the queue
while (queue->size() > 0) {
int32_t matchSize = 0;
// pop matching terms
//Pop the first SegmentMergeInfo from the queue
match[matchSize++] = queue->pop();
//Get the Term of match[0]
Term* term = match[0]->term;
//Condition check to see if term points to a valid instance
CND_CONDITION(term != NULL,"term is NULL") ;
//Get the current top of the queue
top = queue->top();
//For each SegmentMergInfo still in the queue
//Check if term matches the term of the SegmentMergeInfo instances in the queue
while (top != NULL && term->equals(top->term) ){ //todo: changed to equals, but check if this is more efficient
//A match has been found so add the matching SegmentMergeInfo to the match array
match[matchSize++] = queue->pop();
//Get the next SegmentMergeInfo
top = queue->top();
}
match[matchSize]=NULL;
//add new TermInfo
mergeTermInfo(match); //matchSize
//Restore the SegmentTermInfo instances in the match array back into the queue
while (matchSize > 0){
smi = match[--matchSize];
//Condition check to see if smi points to a valid instance
CND_CONDITION(smi != NULL,"smi is NULL") ;
//Move to the next term in the enumeration of SegmentMergeInfo smi
if (smi->next()){
//There still are some terms so restore smi in the queue
queue->put(smi);
}else{
//Done with a segment
//No terms anymore so close this SegmentMergeInfo instance
smi->close();
_CLDELETE( smi );
}
}
}
_CLDELETE_ARRAY(match);
}
void SegmentMerger::mergeTermInfo( SegmentMergeInfo** smis){
//Func - Writes the merged postings and skip data of one term and, when at
//       least one document was written, adds the term to the dictionary
//Pre  - smis != NULL; it is passed on unchanged to appendPostings
//       freqOutput != NULL
//       proxOutput != NULL
//Post - Postings and skip data have been appended; if the document count
//       returned by appendPostings is positive, termInfosWriter has received
//       smis[0]->term together with the updated termInfo

    CND_PRECONDITION(smis != NULL, "smis is NULL");
    CND_PRECONDITION(freqOutput != NULL, "freqOutput is NULL");
    CND_PRECONDITION(proxOutput != NULL, "proxOutput is NULL");

    //Remember where this term's data starts in both output files
    int64_t freqStart = freqOutput->getFilePointer();
    int64_t proxStart = proxOutput->getFilePointer();

    //Append the postings, then the buffered skip data
    int32_t docFreq = appendPostings(smis);
    int64_t skipStart = writeSkip();

    //No document written for this term: no dictionary entry
    if (docFreq <= 0) {
        return;
    }

    //The skip position is stored relative to the start of the term's freq data
    const int32_t skipOffset = static_cast<int32_t>(skipStart - freqStart);
    termInfo.set(docFreq, freqStart, proxStart, skipOffset);

    //The term written to the dictionary is taken from the first entry
    CND_PRECONDITION(smis[0]->term != NULL, "smis[0]->term is NULL");
    termInfosWriter->add(smis[0]->term, &termInfo);
}
int32_t SegmentMerger::appendPostings(SegmentMergeInfo** smis){
//Func - Walks the postings of every SegmentMergeInfo in smis and writes the
//       merged document/frequency entries to freqOutput and the position
//       deltas to proxOutput
//Pre  - smis != NULL and is terminated by a NULL entry
//       freqOutput != NULL
//       proxOutput != NULL
//Post - Returns the number of postings written across all entries of smis

    CND_PRECONDITION(smis != NULL, "smis is NULL");
    CND_PRECONDITION(freqOutput != NULL, "freqOutput is NULL");
    CND_PRECONDITION(proxOutput != NULL, "proxOutput is NULL");

    int32_t previousDoc = 0;   //last document number written
    int32_t docCount    = 0;   //number of postings written so far

    resetSkip();

    //Visit the entries in array order until the NULL terminator
    for (SegmentMergeInfo** cursor = smis; *cursor != NULL; ++cursor) {
        SegmentMergeInfo* current = *cursor;
        CND_PRECONDITION(current!=NULL," is NULL");

        TermPositions* positions = current->postings;
        int32_t  segmentBase     = current->base;
        int32_t* deletionMap     = current->docMap;

        //Position the postings on the term the entry's enumeration points at
        positions->seek(current->termEnum);

        while (positions->next()) {
            //Translate the segment-local document number: first through the
            //docMap (when there is one), then by the segment's base
            int32_t mergedDoc = positions->doc();
            if (deletionMap != NULL) {
                mergedDoc = deletionMap[mergedDoc];
            }
            mergedDoc += segmentBase;

            CND_CONDITION(mergedDoc >= previousDoc,"docs out of order");

            //Every skipInterval-th posting buffers a skip entry for the
            //previously written document
            ++docCount;
            if ((docCount % skipInterval) == 0) {
                bufferSkip(previousDoc);
            }

            //The document delta is shifted left; the low bit flags freq == 1
            const int32_t docCode = (mergedDoc - previousDoc) << 1;
            previousDoc = mergedDoc;

            const int32_t freq = positions->freq();
            if (freq == 1){
                freqOutput->writeVInt(docCode | 1);
            }else{
                freqOutput->writeVInt(docCode);
                freqOutput->writeVInt(freq);
            }

            //Positions are written as deltas to the preceding position
            int32_t previousPosition = 0;
            for (int32_t remaining = freq; remaining > 0; --remaining) {
                const int32_t position = positions->nextPosition();
                proxOutput->writeVInt(position - previousPosition);
                previousPosition = position;
            }
        }
    }

    return docCount;
}
void SegmentMerger::resetSkip(){
skipBuffer->reset();
lastSkipDoc = 0;
lastSkipFreqPointer = freqOutput->getFilePointer();
lastSkipProxPointer = proxOutput->getFilePointer();
}
void SegmentMerger::bufferSkip(int32_t doc){
//Func - Appends one skip entry to skipBuffer
//Pre  - skipBuffer, freqOutput and proxOutput are dereferenced without a check
//Post - Three VInts have been written to skipBuffer: the document delta and
//       the freq/prox file pointer deltas, each relative to the previous
//       skip entry; the lastSkip members now describe this entry

    const int64_t freqNow = freqOutput->getFilePointer();
    const int64_t proxNow = proxOutput->getFilePointer();

    //Pointer deltas are narrowed to 32 bit before being written
    const int32_t freqDelta = static_cast<int32_t>(freqNow - lastSkipFreqPointer);
    const int32_t proxDelta = static_cast<int32_t>(proxNow - lastSkipProxPointer);

    skipBuffer->writeVInt(doc - lastSkipDoc);
    skipBuffer->writeVInt(freqDelta);
    skipBuffer->writeVInt(proxDelta);

    //This entry becomes the reference point for the next one
    lastSkipDoc         = doc;
    lastSkipFreqPointer = freqNow;
    lastSkipProxPointer = proxNow;
}
int64_t SegmentMerger::writeSkip(){
//Func - Hands the buffered skip data to freqOutput
//Pre  - skipBuffer and freqOutput are dereferenced without a check
//Post - skipBuffer->writeTo(freqOutput) has been called; returns the file
//       pointer freqOutput reported before that call

    const int64_t skipStart = freqOutput->getFilePointer();
    skipBuffer->writeTo(freqOutput);
    return skipStart;
}
void SegmentMerger::mergeNorms() {
//Func - Merges the norms for all fields
//Pre - fieldInfos != NULL
//Post - The norms for all fields have been merged
CND_PRECONDITION(fieldInfos != NULL, "fieldInfos is NULL");
IndexReader* reader = NULL;
IndexOutput* output = NULL;
//iterate through all the Field Infos instances
for (int32_t i = 0; i < fieldInfos->size(); i++) {
//Get the i-th FieldInfo
FieldInfo* fi = fieldInfos->fieldInfo(i);
//Is this Field indexed?
if (fi->isIndexed){
//Create an new filename for the norm file
const char* buf = Misc::segmentname(segment,".f", i);
//Instantiate an IndexOutput to that norm file
output = directory->createOutput( buf );
//Condition check to see if output points to a valid instance
CND_CONDITION(output != NULL, "No Outputstream retrieved");
//Destroy the buffer of the filename
_CLDELETE_CaARRAY( buf );
try{
//Iterate throug all SegmentReaders
for (uint32_t j = 0; j < readers.size(); j++) {
//Get the i-th IndexReader
reader = readers[j];
//Condition check to see if reader points to a valid instance
CND_CONDITION(reader != NULL, "No reader found");
//Get an IndexInput to the norm file for this field in this segment
uint8_t* input = reader->norms(fi->name);
//Get the total number of documents including the documents that have been marked deleted
int32_t maxDoc = reader->maxDoc();
//Iterate through all the documents
for(int32_t k = 0; k < maxDoc; k++) {
//Get the norm
//Note that the byte must always be read especially when the document
//is marked deleted to remain in sync
uint8_t norm = input != NULL ? input[k] : 0;
//Check if document k is deleted
if (!reader->isDeleted(k)){
//write the new norm
output->writeByte(norm);
}
}
}
}
_CLFINALLY(
if (output != NULL){
//Close the IndexOutput output
output->close();
//destroy it
_CLDELETE(output);
}
);
}
}
}
CL_NS_END
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -