📄 segmentreader.cpp
字号:
return tis->terms();
}
TermEnum* SegmentReader::terms(const Term* t) const {
    //Func - Asks tis for a term enumeration that begins at the named term t or,
    //       failing that, at the first term after it
    //Pre  - t != NULL
    //       tis != NULL
    //Post - The enumeration produced by tis->terms(t) has been returned

    CND_PRECONDITION(t != NULL, "t is NULL");
    CND_PRECONDITION(tis != NULL, "tis is NULL");

    //Delegate the actual lookup to tis
    TermEnum* enumeration = tis->terms(t);
    return enumeration;
}
Document* SegmentReader::document(const int32_t n) {
    //Func - Fetches the document with number n from fieldsReader
    //Pre  - n >= 0 and identifies a document
    //Post - A CL_ERR_InvalidState error has been thrown when document n is marked
    //       deleted, otherwise the document read by fieldsReader has been returned

    SCOPED_LOCK_MUTEX(THIS_LOCK)

    CND_PRECONDITION(n >= 0, "n is a negative number");

    //Deleted documents are never handed out
    if (isDeleted(n)){
        _CLTHROWA( CL_ERR_InvalidState,"attempt to access a deleted document" );
    }

    //Read document n and make sure something came back
    Document* found = fieldsReader->doc(n);
    CND_CONDITION(found != NULL, "No document could be retrieved");

    return found;
}
bool SegmentReader::isDeleted(const int32_t n) {
    //Func - Tells whether document n has been marked deleted
    //Pre  - n >= 0 and identifies a document
    //Post - true has been returned when deletedDocs exists and reports n as set,
    //       otherwise false

    SCOPED_LOCK_MUTEX(THIS_LOCK)

    CND_PRECONDITION(n >= 0, "n is a negative number");

    //Without deletedDocs no document is reported as deleted
    if (deletedDocs == NULL)
        return false;

    return deletedDocs->get(n);
}
TermDocs* SegmentReader::termDocs() const {
    //Func - Creates a new SegmentTermDocs enumerator bound to this reader
    //Pre  - true
    //Post - The newly created, still unpositioned, enumerator has been returned

    TermDocs* enumerator = _CLNEW SegmentTermDocs(this);
    return enumerator;
}
TermPositions* SegmentReader::termPositions() const {
    //Func - Creates a new SegmentTermPositions enumerator bound to this reader
    //Pre  - true
    //Post - The newly created, still unpositioned, enumerator has been returned

    TermPositions* enumerator = _CLNEW SegmentTermPositions(this);
    return enumerator;
}
int32_t SegmentReader::docFreq(const Term* t) const {
//Func - Returns the number of documents which contain the term t
//Pre - t holds a valid reference to a Term
//Post - The number of documents which contain term t has been returned
//Get the TermInfo ti for Term t in the set
TermInfo* ti = tis->get(t);
//Check if an TermInfo has been returned
if (ti){
//Get the frequency of the term
int32_t ret = ti->docFreq;
//TermInfo ti is not needed anymore so delete it
_CLDELETE( ti );
//return the number of documents which containt term t
return ret;
}
else
//No TermInfo returned so return 0
return 0;
}
int32_t SegmentReader::numDocs() {
    //Func - Computes the number of documents in the segment that are not deleted
    //Pre  - true
    //Post - maxDoc() minus the count of deletedDocs (when present) has been returned

    //All documents, including the ones marked deleted
    const int32_t total = maxDoc();

    //No deletions recorded: every document counts
    if (deletedDocs == NULL)
        return total;

    //Leave out the documents that have been marked deleted
    return total - deletedDocs->count();
}
int32_t SegmentReader::maxDoc() const {
    //Func - Reports the size of fieldsReader, i.e. the number of documents in the
    //       segment with the ones marked deleted still included
    //Pre  - true
    //Post - fieldsReader->size() has been returned

    const int32_t total = fieldsReader->size();
    return total;
}
void SegmentReader::norms(const TCHAR* field, uint8_t* bytes) {
//Func - Reads the Norms for field from disk starting at offset in the inputstream
//Pre - field != NULL
// bytes != NULL is an array of bytes which is to be used to read the norms into.
// it is advisable to have bytes initalized by zeroes!
//Post - The if an inputstream to the norm file could be retrieved the bytes have been read
// You are never sure whether or not the norms have been read into bytes properly!!!!!!!!!!!!!!!!!
CND_PRECONDITION(field != NULL, "field is NULL");
CND_PRECONDITION(bytes != NULL, "field is NULL");
SCOPED_LOCK_MUTEX(THIS_LOCK)
Norm* norm = _norms.get(field);
if ( norm == NULL )
return; // use zeros in array
if (norm->bytes != NULL) { // can copy from cache
memcpy(bytes,norm->bytes,maxDoc());
return;
}
IndexInput* _normStream = norm->in->clone();
CND_PRECONDITION(_normStream != NULL, "normStream==NULL")
// read from disk
try{
_normStream->seek(0);
_normStream->readBytes(bytes, maxDoc());
}_CLFINALLY(
//Have the normstream closed
_normStream->close();
//Destroy the normstream
_CLDELETE( _normStream );
);
}
uint8_t* SegmentReader::norms(const TCHAR* field) {
//Func - Returns the bytes array that holds the norms of a named field
//Pre - field != NULL and contains the name of the field for which the norms
// must be retrieved
//Post - If there was norm for the named field then a bytes array has been allocated
// and returned containing the norms for that field. If the named field is unknown NULL is returned.
CND_PRECONDITION(field != NULL, "field is NULL");
SCOPED_LOCK_MUTEX(THIS_LOCK)
//Try to retrieve the norms for field
Norm* norm = (Norm*)_norms.get(field);
//Check if a norm instance was found
if (norm == NULL){
//return NULL as there are no norms to be returned
return NULL;
}
if (norm->bytes == NULL) { //value not read yet
//allocate a new bytes array to hold the norms
uint8_t* bytes = _CL_NEWARRAY(uint8_t,maxDoc());
//Condition check to see if bytes points to a valid array
CND_CONDITION(bytes != NULL, "bytes is NULL");
//Read the norms from disk straight into the new bytes array
norms(field, bytes);
norm->bytes = bytes; // cache it
}
//Return the norms
return norm->bytes;
}
void SegmentReader::doSetNorm(int32_t doc, const TCHAR* field, uint8_t value){
Norm* norm = _norms.get(field);
if (norm == NULL) // not an indexed field
return;
norm->dirty = true; // mark it dirty
normsDirty = true;
uint8_t* bits = norms(field);
bits[doc] = value; // set the value
}
char* SegmentReader::SegmentName(const char* ext, const int32_t x){
    //Func - Allocates a buffer of CL_MAX_PATH chars and lets the buffer based
    //       SegmentName overload build a filename in it from segment, ext and x
    //Pre  - ext != NULL and holds the extension
    //       x contains a number
    //Post - The newly allocated buffer has been returned. With x = -1 it contains
    //       the concatenation of segment and ext, otherwise the concatenation of
    //       segment, ext and x

    CND_PRECONDITION(ext != NULL, "ext is NULL");

    //Reserve room for a path of maximum length
    char* fileName = _CL_NEWARRAY(char,CL_MAX_PATH);

    //Have the filename written into it
    SegmentName(fileName,CL_MAX_PATH,ext,x);

    return fileName;
}
void SegmentReader::SegmentName(char* buffer,int32_t bufferLen, const char* ext, const int32_t x ){
    //Func - Builds a filename in buffer from segment, ext and x by forwarding all
    //       arguments to Misc::segmentname
    //Pre  - buffer != NULL, bufferLen is passed on as its length
    //       segment != NULL
    //       ext != NULL (not checked here)
    //       x contains a number
    //Post - With x = -1 buffer contains the concatenation of segment and ext,
    //       otherwise the concatenation of segment, ext and x

    CND_PRECONDITION(buffer != NULL, "buffer is NULL");
    CND_PRECONDITION(segment != NULL, "Segment is NULL");

    //The actual formatting lives in Misc
    Misc::segmentname(buffer,bufferLen,segment,ext,x);
}
void SegmentReader::openNorms(Directory* cfsDir) {
    //Func - Registers a Norm, holding an opened input stream on the matching ".f"
    //       norm file, for every indexed field in fieldInfos
    //Pre  - true
    //Post - _norms contains, under the name of each indexed field, a Norm created
    //       with the opened norm file of that field

    for (int32_t i = 0; i < fieldInfos->size(); ++i) {
        FieldInfo* info = fieldInfos->fieldInfo(i);

        //Fields that are not indexed get no Norm
        if (!info->isIndexed)
            continue;

        //Build the name of the norm file that belongs to this field
        char normFile[CL_MAX_PATH];
        SegmentName(normFile,CL_MAX_PATH, ".f", info->number);

        //Use the reader's own directory when it has the file, cfsDir otherwise
        Directory* source = getDirectory();
        if ( !source->fileExists(normFile) )
            source = cfsDir;

        //TODO, should info->name be copied?
        //Open the file, wrap it in a Norm and store it under the field name
        Norm* opened = _CLNEW Norm( source->openInput( normFile ),info->number, this, segment );
        _norms.put(info->name, opened);
    }
}
void SegmentReader::closeNorms() {
//Func - Close all the norms stored in norms
//Pre - true
//Post - All the norms have been destroyed
SCOPED_LOCK_MUTEX(_norms.THIS_LOCK)
//Create an interator initialized at the beginning of norms
CL_NS(util)::CLHashtable<const TCHAR*,Norm*,Compare::TChar,Equals::TChar>::iterator itr = _norms.begin();
//Iterate through all the norms
while (itr != _norms.end()) {
//Get the norm
Norm* n = itr->second;
//delete the norm n
_CLDELETE(n);
//Move the interator to the next norm in the norms collection.
//Note ++ is an overloaded operator
++itr;
}
_norms.clear(); //bvk: they're deleted, so clear them so that they are not re-used
}
/**
* Returns the unique names of all fields known to fieldInfos.
*
* @return a newly allocated, NULL-terminated array of duplicated field names
* @see IndexReader#getFieldNames()
*/
TCHAR** SegmentReader::getFieldNames(){
    //Func - Collects the name of every field in fieldInfos, each name only once
    //Pre  - true
    //Post - A NULL terminated array holding a duplicate of each distinct field name
    //       has been returned

    const int32_t len = fieldInfos->size();

    //One slot per field plus one for the terminating NULL
    TCHAR** ret = _CL_NEWARRAY(TCHAR*,len+1);

    //Number of unique names stored in ret so far
    int32_t p = 0;

    for (int32_t i = 0; i < len; ++i) {
        FieldInfo* fi = fieldInfos->fieldInfo(i);

        //Only the first p slots are filled, so only those may be compared.
        //A plain continue here would merely advance this inner loop, which is why
        //a flag is used to skip the name in the outer loop
        bool alreadyStored = false;
        for (int32_t j = 0; j < p; ++j) {
            if ( _tcscmp(fi->name,ret[j]) == 0 ) {
                alreadyStored = true;
                break;
            }
        }

        if ( !alreadyStored )
            ret[p++]=STRDUP_TtoT(fi->name);
    }

    //Terminate the array
    ret[p]=NULL;
    return ret;
}
/**
* Returns the unique names of the fields whose isIndexed flag equals indexed.
*
* @see IndexReader#getFieldNames(boolean)
*/
TCHAR** SegmentReader::getFieldNames(bool indexed) {
    //Func - Collects the distinct names of all fields whose isIndexed flag matches
    //       indexed
    //Pre  - true
    //Post - A NULL terminated array holding a duplicate of each collected name has
    //       been returned

    //Set used to keep every name only once
    CL_NS(util)::CLSetList<const TCHAR*> uniqueNames(false);

    for (int32_t i = 0; i < fieldInfos->size(); ++i) {
        FieldInfo* info = fieldInfos->fieldInfo(i);

        //Skip the fields that do not have the requested indexed state
        if (info->isIndexed != indexed)
            continue;

        if ( uniqueNames.find(info->name)==uniqueNames.end() )
            uniqueNames.insert(info->name);
    }

    //One slot per name plus one for the terminating NULL
    TCHAR** names = _CL_NEWARRAY(TCHAR*,uniqueNames.size()+1);

    //Duplicate every collected name into the result array
    int count=0;
    for (CL_NS(util)::CLSetList<const TCHAR*>::iterator itr = uniqueNames.begin(); itr != uniqueNames.end(); ++itr) {
        const TCHAR* name = *itr;
        names[count]=STRDUP_TtoT(name);
        ++count;
    }

    names[uniqueNames.size()]=NULL;
    return names;
}
/**
* Lists the names of the indexed fields, selected by their term vector setting.
*
* @param storedTermVector if true, returns only indexed fields that have term vector info,
* else only indexed fields without term vector info
* @return NULL-terminated array of duplicated strings holding the names of the fields
*/
TCHAR** SegmentReader::getIndexedFieldNames(bool storedTermVector) {
    //Func - Collects the distinct names of all indexed fields whose storeTermVector
    //       flag matches storedTermVector
    //Pre  - true
    //Post - A NULL terminated array holding a duplicate of each collected name has
    //       been returned

    //Set used to keep every name only once
    CL_NS(util)::CLSetList<const TCHAR*> uniqueNames(false);

    for (int32_t i = 0; i < fieldInfos->size(); ++i) {
        FieldInfo* info = fieldInfos->fieldInfo(i);

        //Skip fields that are not indexed or have the other term vector setting
        if (info->isIndexed != true || info->storeTermVector != storedTermVector)
            continue;

        if ( uniqueNames.find(info->name)==uniqueNames.end() )
            uniqueNames.insert(info->name);
    }

    //One slot per name plus one for the terminating NULL
    TCHAR** names = _CL_NEWARRAY(TCHAR*,uniqueNames.size()+1);

    //Duplicate every collected name into the result array
    int count=0;
    for (CL_NS(util)::CLSetList<const TCHAR*>::iterator itr = uniqueNames.begin(); itr != uniqueNames.end(); ++itr) {
        const TCHAR* name = *itr;
        names[count]=STRDUP_TtoT(name);
        ++count;
    }

    names[uniqueNames.size()]=NULL;
    return names;
}
TermFreqVector* SegmentReader::getTermFreqVector(int32_t docNumber, const TCHAR* field){
    //Func - Retrieves the term frequency vector of the named field of document
    //       docNumber
    //Pre  - true
    //Post - NULL has been returned when field is unknown to fieldInfos, when it
    //       does not store a term vector, or when there is no termVectorsReader.
    //       Otherwise the result of termVectorsReader->get(docNumber, field) has
    //       been returned

    // Check if this field is invalid or has no stored term vector
    FieldInfo* fi = fieldInfos->fieldInfo(field);
    if (fi == NULL || !fi->storeTermVector)
        return NULL;

    // getTermFreqVectors treats a missing termVectorsReader as "no vectors";
    // do the same here instead of dereferencing a NULL pointer
    if (termVectorsReader == NULL)
        return NULL;

    return termVectorsReader->get(docNumber, field);
}
TermFreqVector** SegmentReader::getTermFreqVectors(int32_t docNumber){
    //Func - Retrieves the term frequency vectors of document docNumber
    //Pre  - true
    //Post - The result of termVectorsReader->get(docNumber) has been returned, or
    //       NULL when there is no termVectorsReader

    if (termVectorsReader != NULL)
        return termVectorsReader->get(docNumber);

    //No term vectors reader available
    return NULL;
}
CL_NS_END
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -