// segmentreader.cpp
// Font size: (code-viewer page control captured together with the source; not part of the code)
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "SegmentHeader.h"
#include "FieldInfos.h"
#include "FieldsReader.h"
#include "IndexReader.h"
#include "TermInfosReader.h"
#include "Terms.h"
CL_NS_USE(util)
CL_NS_USE(store)
CL_NS_USE(document)
CL_NS_DEF(index)
SegmentReader::Norm::Norm(IndexInput* instrm, int32_t n, SegmentReader* r, const char* seg)
    : in(instrm),
      number(n),
      reader(r),
      segment(seg)
{
    //Func - Constructor
    //Pre  - instrm is a valid pointer to the IndexInput kept in 'in'
    //       n, r and seg are stored in 'number', 'reader' and 'segment'
    //Post - A Norm instance has been created that holds no byte array yet
    //       (bytes == NULL) and is not flagged as dirty
    dirty = false;
    bytes = NULL;
}
SegmentReader::Norm::~Norm() {
    //Func - Destructor
    //Pre  - true
    //Post - The IndexInput 'in' and the 'bytes' array have been released

    //'in' is a pointer to an IndexInput, so it has to be deleted explicitly.
    //According to the original author the stream is closed by its destructor.
    _CLDELETE(in);

    //Release the byte array (a no-op when nothing was ever loaded)
    _CLDELETE_ARRAY(bytes);
}
void SegmentReader::Norm::reWrite(){
char buf[CL_MAX_PATH];
char fileName[CL_MAX_PATH];
sprintf(buf,"%s.tmp",segment);
// NOTE: norms are re-written in regular directory, not cfs
IndexOutput* out = reader->getDirectory()->createOutput(buf);
try {
out->writeBytes(bytes, reader->maxDoc());
}_CLFINALLY( out->close(); _CLDELETE(out) );
sprintf(fileName,"%s.f%d",segment,number);
reader->getDirectory()->renameFile(buf, fileName);
this->dirty = false;
}
SegmentReader::SegmentReader(SegmentInfo* si)
    : IndexReader(si->getDir()),    //the base class works on the directory of the segment
      _norms(false,false)
{
    //Func - Constructor
    //Pre  - si is a valid pointer to a SegmentInfo instance
    //Post - The instance has been set up by initialize(si)
    initialize(si);
}
SegmentReader::SegmentReader(SegmentInfos* sis, SegmentInfo* si)
    : IndexReader(si->getDir(),sis,false),    //base class: directory of the segment, sis and 'false'
      _norms(false,false)
{
    //Func - Constructor
    //Pre  - si is a valid pointer to a SegmentInfo instance
    //       sis is handed to the IndexReader base class unchanged
    //Post - The instance has been set up by initialize(si)
    initialize(si);
}
void SegmentReader::initialize(SegmentInfo* si){
    //Func - Common set-up routine called by both constructors
    //Pre  - si is a valid pointer to the SegmentInfo instance of the segment
    //Post - The field infos, the frequency and prox streams, the fields
    //       reader, the term infos reader, the deletions (when the segment
    //       has any), the norms and the term vectors reader (when the segment
    //       has vectors) have been opened

    //Nothing has been deleted or modified through this reader yet
    deletedDocs      = NULL;
    deletedDocsDirty = false;
    normsDirty       = false;
    undeleteAll      = false;

    //The streams are opened further down
    freqStream = NULL;
    proxStream = NULL;

    //Keep a duplicate of the segment name found in the SegmentInfo
    segment = STRDUP_AtoA(si->name);

    //Buffer that receives, one after the other, the file names of the segment
    char nameBuf[CL_MAX_PATH];

    //Read from the compound file (.cfs) when the segment has one, otherwise
    //straight from the directory of this reader
    Directory* dir = getDirectory();
    cfsReader = NULL;
    SegmentName(nameBuf, CL_MAX_PATH, ".cfs");
    if (dir->fileExists(nameBuf)) {
        cfsReader = _CLNEW CompoundFileReader(dir, nameBuf);
        dir = cfsReader;
    }

    //Field infos file (.fnm)
    SegmentName(nameBuf, CL_MAX_PATH, ".fnm");
    fieldInfos = _CLNEW FieldInfos(dir, nameBuf );
    CND_CONDITION(fieldInfos != NULL,"No memory could be allocated for fieldInfos");

    //Frequency file (.frq); when LUCENE_FS_MMAP is defined and the directory
    //reports the type "FS" the file is opened through openMMapFile instead
    SegmentName(nameBuf, CL_MAX_PATH, ".frq");
#ifdef LUCENE_FS_MMAP
    if ( strcmp(dir->getDirectoryType(), "FS") == 0 ){
        FSDirectory* fsdir = (FSDirectory*)dir;
        freqStream = fsdir->openMMapFile( nameBuf );
    }else
#endif
        freqStream = dir->openInput( nameBuf );
    CND_CONDITION(freqStream != NULL, "IndexInput freqStream could not open the frequency file");

    //Prox file (.prx)
    SegmentName(nameBuf, CL_MAX_PATH, ".prx");
    proxStream = dir->openInput( nameBuf );
    CND_CONDITION(proxStream != NULL, "IndexInput proxStream could not open proximity file");

    //Reader for the stored fields of the segment
    fieldsReader = _CLNEW FieldsReader(dir, segment, fieldInfos);
    CND_CONDITION(fieldsReader != NULL,"No memory could be allocated for fieldsReader");

    //Reader for the term dictionary of the segment
    tis = _CLNEW TermInfosReader(dir, segment, fieldInfos);
    CND_CONDITION(tis != NULL,"No memory could be allocated for tis");

    //Load the deletions when the segment has a .del file
    // NOTE: the bitvector is stored using the regular directory, not cfs
    if (hasDeletions(si)){
        SegmentName(nameBuf, CL_MAX_PATH, ".del");
        deletedDocs = _CLNEW BitSet(getDirectory(), nameBuf );
    }

    //Open the norm files. According to the original notes there is one norm
    //file (.f[0-9]*) per indexed field with one byte per document; that byte
    //encodes a value that is multiplied into the score for hits on the field
    openNorms(dir);

    //Term vector files are opened only when the field infos report vectors
    termVectorsReader = NULL;
    if (fieldInfos->hasVectors())
        termVectorsReader = _CLNEW TermVectorsReader(dir, segment, fieldInfos);
}
SegmentReader::~SegmentReader(){
    //Func - Destructor.
    //Pre  - true
    //Post - doClose() has been invoked and everything owned by the instance
    //       has been released

    //Closing here means the reader does not have to be closed manually
    doClose();

    //Readers and streams
    _CLDELETE(fieldInfos);
    _CLDELETE(fieldsReader);
    _CLDELETE(tis);
    _CLDELETE(freqStream);
    _CLDELETE(proxStream);

    //The duplicated segment name
    _CLDELETE_CaARRAY(segment);

    //Deletions, term vectors and the compound file reader
    _CLDELETE(deletedDocs);
    _CLDELETE(termVectorsReader);
    _CLDECDELETE(cfsReader);
}
void SegmentReader::doCommit(){
char bufdel[CL_MAX_PATH];
strcpy(bufdel,segment);
strcat(bufdel,".del");
if (deletedDocsDirty) { // re-write deleted
char buftmp[CL_MAX_PATH];
strcpy(buftmp,segment);
strcat(buftmp,".tmp");
deletedDocs->write(getDirectory(), buftmp);
getDirectory()->renameFile(buftmp,bufdel);
}
if(undeleteAll && getDirectory()->fileExists(bufdel)){
getDirectory()->deleteFile(bufdel);
}
if (normsDirty) { // re-write norms
CL_NS(util)::CLHashtable<const TCHAR*,Norm*,Compare::TChar,Equals::TChar>::iterator itr = _norms.begin();
Norm* norm;
while (itr != _norms.end()) {
norm = itr->second;
if (norm->dirty) {
norm->reWrite();
}
++itr;
}
}
deletedDocsDirty = false;
normsDirty = false;
undeleteAll = false;
}
void SegmentReader::doClose() {
    //Func - Closes the readers and streams held for this segment
    //Pre  - fieldsReader != NULL
    //       tis != NULL
    //Post - close() has been called on every reader and stream that is
    //       present and closeNorms() has been invoked
    CND_PRECONDITION(fieldsReader != NULL, "fieldsReader is NULL");
    CND_PRECONDITION(tis != NULL, "tis is NULL");

    //These two are guaranteed by the preconditions
    fieldsReader->close();
    tis->close();

    //The frequency and prox streams are closed only when they exist
    if (freqStream != NULL)
        freqStream->close();
    if (proxStream != NULL)
        proxStream->close();

    //Close the norms
    closeNorms();

    //Term vectors and compound file reader are optional as well
    if (termVectorsReader != NULL)
        termVectorsReader->close();
    if (cfsReader != NULL)
        cfsReader->close();
}
bool SegmentReader::hasDeletions() {
    //Func - Tells whether this reader currently holds a deletedDocs bit set
    //Pre  - true
    //Post - true is returned when deletedDocs is not NULL, otherwise false
    if (deletedDocs == NULL)
        return false;
    return true;
}
//static
bool SegmentReader::usesCompoundFile(SegmentInfo* si) {
    //Func - Static method
    //       Checks whether the segment described by si has a compound file
    //Pre  - si is a valid pointer to a SegmentInfo instance
    //Post - true is returned when "<name>.cfs" exists in the directory of si,
    //       otherwise false

    //Build the name with the size-aware helper that hasDeletions(si) uses for
    //the .del file; the former strcpy/strcat pair wrote past the buffer when
    //the segment name was too long
    char cfsName[CL_MAX_PATH];
    Misc::segmentname(cfsName, CL_MAX_PATH, si->name, ".cfs", -1);

    return si->getDir()->fileExists(cfsName);
}
//static
bool SegmentReader::hasSeparateNorms(SegmentInfo* si) {
    //Func - Static method
    //       Checks whether the directory of si contains a file whose name
    //       starts with "<name>.f" immediately followed by a digit
    //Pre  - si is a valid pointer to a SegmentInfo instance
    //       list() returns an array of names terminated by a NULL entry
    //Post - true is returned when such a file exists, otherwise false
    //       The listing returned by list() has been released completely

    //Build the prefix with the size-aware helper that hasDeletions(si) uses;
    //the former strcpy/strcat pair wrote past the buffer when the segment
    //name was too long
    char prefix[CL_MAX_PATH];
    Misc::segmentname(prefix, CL_MAX_PATH, si->name, ".f", -1);
    const size_t prefixLength = strlen(prefix);

    char** names = si->getDir()->list();
    bool found = false;

    //Every entry has to be visited, even after a match, because each name
    //must be released
    for (int32_t i = 0; names[i] != NULL; ++i) {
        const char* name = names[i];
        if ( !found
             && strlen(name) > prefixLength
             && strncmp(name, prefix, prefixLength) == 0
             && name[prefixLength] >= '0' && name[prefixLength] <= '9' ){
            found = true;
        }
        _CLDELETE_CaARRAY(names[i]);
    }
    _CLDELETE_ARRAY(names);

    return found;
}
bool SegmentReader::hasDeletions(const SegmentInfo* si) {
//Func - Static method
// Checks if a segment managed by SegmentInfo si-> has deletions
//Pre - si-> holds a valid reference to an SegmentInfo instance
//Post - if the segement contains deleteions true is returned otherwise flas
//Create a buffer f of length CL_MAX_PATH
char f[CL_MAX_PATH];
//SegmentReader::segmentname(f, si->name,_T(".del"),-1 );
//create the name of the deletion file
Misc::segmentname(f,CL_MAX_PATH, si->name,".del",-1 );
//Check if the deletion file exists and return the result
return si->getDir()->fileExists( f );
}
//synchronized
void SegmentReader::doDelete(const int32_t docNum){
    //Func - Marks the document with number docNum as deleted
    //Pre  - 0 <= docNum < maxDoc()
    //Post - The bit of docNum has been set in deletedDocs, deletedDocsDirty
    //       has been set and undeleteAll has been cleared
    SCOPED_LOCK_MUTEX(THIS_LOCK)

    CND_PRECONDITION(docNum >= 0, "docNum is a negative number");
    CND_PRECONDITION(docNum < maxDoc(), "docNum is bigger than the total number of documents");

    //Create the bit set on the first deletion
    if (deletedDocs == NULL){
        deletedDocs = _CLNEW BitSet(maxDoc());
        CND_CONDITION(deletedDocs != NULL,"No memory could be allocated for deletedDocs");
    }

    //There is now a pending deletion, which overrides a pending undeleteAll
    deletedDocsDirty = true;
    undeleteAll = false;

    //Record the deletion itself
    deletedDocs->set(docNum);
}
void SegmentReader::doUndeleteAll(){
    //Func - Discards all deletions held by this reader
    //Pre  - true
    //Post - deletedDocs has been released, no deletions are pending any more
    //       and undeleteAll has been set so that doCommit() removes an
    //       existing .del file
    _CLDELETE(deletedDocs);

    undeleteAll = true;
    deletedDocsDirty = false;
}
AStringArrayConstWithDeletor* SegmentReader::files() {
    //Func - Returns the names of the files that belong to this segment
    //Pre  - segment != NULL
    //Post - A newly allocated list has been returned. It contains the name of
    //       every fixed-extension file that exists in the directory of this
    //       reader, followed by "<segment>.f<i>" for every indexed field i
    CND_PRECONDITION(segment != NULL, "segment is NULL");

    AStringArrayConstWithDeletor* files = _CLNEW AStringArrayConstWithDeletor(true);
    //Condition check to see if files points to a valid instance
    CND_CONDITION(files != NULL, "No memory could be allocated for files");

    //Extensions that are reported only when the file exists. A table replaces
    //the former function-local _ADD_SEGMENT macro, which used a reserved
    //identifier and stayed defined for the rest of the translation unit.
    static const char* const extensions[] = {
        ".cfs", ".fnm", ".fdx", ".fdt", ".tii", ".tis", ".frq",
        ".prx", ".del", ".tvx", ".tvd", ".tvf", ".tvp"
    };
    const size_t extensionCount = sizeof(extensions) / sizeof(extensions[0]);

    for (size_t e = 0; e < extensionCount; ++e) {
        char* name = SegmentName( extensions[e] );
        if ( getDirectory()->fileExists(name) ) {
            files->push_back(name);
        } else {
            //Not handed to the list, so the name has to be released here
            _CLDELETE_CaARRAY(name);
        }
    }

    //Norm files: one name per indexed field. These names are added without
    //checking whether the file exists.
    for (int32_t i = 0; i < fieldInfos->size(); ++i) {
        FieldInfo* fi = fieldInfos->fieldInfo(i);
        if (fi->isIndexed){
            files->push_back( SegmentName(".f", i) );
        }
    }

    return files;
}
TermEnum* SegmentReader::terms() const {
//Func - Returns an enumeration of all the Terms and TermInfos in the set.
//Pre - tis != NULL
//Post - An enumeration of all the Terms and TermInfos in the set has been returned
CND_PRECONDITION(tis != NULL, "tis is NULL");
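/* Code-viewer page residue (keyboard shortcut help), not part of the source:
 *   Copy code:           Ctrl + C
 *   Search code:         Ctrl + F
 *   Full-screen mode:    F11
 *   Toggle theme:        Ctrl + Shift + D
 *   Show shortcuts:      ?
 *   Increase font size:  Ctrl + =
 *   Decrease font size:  Ctrl + -
 */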