📄 termvectorwriter.cpp
字号:
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "TermVector.h"
#include "CLucene/util/Misc.h"
CL_NS_USE(util)
CL_NS_DEF(index)
/**
 * Creates the three term vector outputs for a segment and writes the format
 * version at the start of each one.
 *
 * @param directory  directory in which the output files are created
 * @param segment    segment name; each file name is this name followed by
 *                   one of the LUCENE_TV*_EXTENSION suffixes
 * @param fieldInfos stored for later use when resolving field names to
 *                   field numbers
 */
TermVectorsWriter::TermVectorsWriter(CL_NS(store)::Directory* directory,
    const char* segment, FieldInfos* fieldInfos)
{
    // The segment name is copied once; only the suffix that follows it is
    // rewritten for each of the three files.
    // NOTE(review): neither copy is bounds-checked, so segment plus suffix
    // must fit in CL_MAX_NAME characters.
    char fileName[CL_MAX_NAME];
    strcpy(fileName, segment);
    char* suffix = fileName + strlen(fileName);

    strcpy(suffix, LUCENE_TVX_EXTENSION);
    tvx = directory->createOutput(fileName);
    tvx->writeInt(FORMAT_VERSION);

    strcpy(suffix, LUCENE_TVD_EXTENSION);
    tvd = directory->createOutput(fileName);
    tvd->writeInt(FORMAT_VERSION);

    strcpy(suffix, LUCENE_TVF_EXTENSION);
    tvf = directory->createOutput(fileName);
    tvf->writeInt(FORMAT_VERSION);

    // Start with no document and no field open.
    this->fieldInfos = fieldInfos;
    currentField = NULL;
    currentDocPointer = -1;
}
/**
 * Closes and releases whichever of the three outputs are still set.
 */
TermVectorsWriter::~TermVectorsWriter()
{
    // Same order as creation: tvx, then tvd, then tvf.
    if (tvx != NULL)
    {
        tvx->close();
        _CLDELETE(tvx);
    }

    if (tvd != NULL)
    {
        tvd->close();
        _CLDELETE(tvd);
    }

    if (tvf != NULL)
    {
        tvf->close();
        _CLDELETE(tvf);
    }
}
/**
 * Starts a new document. A document that is still open is closed first;
 * the new document is then marked open by recording the current position
 * in the tvd output.
 */
void TermVectorsWriter::openDocument()
{
    closeDocument();

    currentDocPointer = tvd->getFilePointer();
}
/**
 * Finishes the current document, if one is open: closes any open field,
 * writes the document records, and resets the per-document state.
 * Does nothing when no document is open.
 */
void TermVectorsWriter::closeDocument()
{
    if (!isDocumentOpen())
        return;

    closeField();
    writeDoc();

    fields.clear();
    currentDocPointer = -1;   // -1 means "no document open"
}
/**
 * @return true while a document is open, i.e. while the recorded document
 *         pointer differs from the -1 sentinel.
 */
bool TermVectorsWriter::isDocumentOpen() const
{
    return !(currentDocPointer == -1);
}
/**
 * Starts a new field within the open document. A field that is still open
 * is closed first.
 *
 * @param field name of the field; resolved to a field number via fieldInfos
 * Raises CL_ERR_InvalidState (through _CLTHROWA) when no document is open.
 */
void TermVectorsWriter::openField(const TCHAR* field)
{
    if (!isDocumentOpen()) {
        _CLTHROWA(CL_ERR_InvalidState,"Cannot open field when no document is open.");
    }

    closeField();

    currentField = _CLNEW TVField(fieldInfos->fieldNumber(field));
}
/**
 * Finishes the current field, if one is open: writes its terms, appends the
 * field to the document's field list, and clears the collected terms.
 * Does nothing when no field is open.
 */
void TermVectorsWriter::closeField()
{
    if (!isFieldOpen())
        return;

    // Persist the field and its terms before forgetting them.
    writeField();
    fields.push_back(currentField);

    terms.clear();
    currentField = NULL;   // NULL means "no field open"
}
/**
 * @return true while a field is open, i.e. while currentField is set.
 */
bool TermVectorsWriter::isFieldOpen() const
{
    return !(currentField == NULL);
}
/**
 * Adds one term to the open field of the open document.
 *
 * @param termText text of the term
 * @param freq     frequency recorded for the term
 * Raises CL_ERR_InvalidState (through _CLTHROWA) when no document or no
 * field is open.
 */
void TermVectorsWriter::addTerm(const TCHAR* termText, int32_t freq)
{
    if (!isDocumentOpen()) {
        _CLTHROWA(CL_ERR_InvalidState,"Cannot add terms when document is not open");
    }
    if (!isFieldOpen()) {
        _CLTHROWA(CL_ERR_InvalidState,"Cannot add terms when field is not open");
    }

    addTermInternal(termText, freq);
}
void TermVectorsWriter::addTermInternal(const TCHAR* termText, int32_t freq) {
currentField->length += freq;
TVTerm* term = _CLNEW TVTerm();
term->setTermText(termText);
term->freq = freq;
terms.push_back(term);
}
/**
 * Adds every term vector of a NULL-terminated array to the open document.
 *
 * @param vectors array of term vector pointers; iteration stops at the first
 *                NULL entry, so the array must be NULL-terminated
 * Raises CL_ERR_InvalidState (through _CLTHROWA) when no document is open or
 * when a field is still open.
 */
void TermVectorsWriter::addVectors(TermFreqVector** vectors)
{
    if (!isDocumentOpen()) _CLTHROWA(CL_ERR_InvalidState,"Cannot add term vectors when document is not open");
    if (isFieldOpen()) _CLTHROWA(CL_ERR_InvalidState,"Cannot add term vectors when field is open");

    // The index must advance on every pass; without the increment the first
    // vector would be added over and over and the loop would never end.
    for (int32_t i = 0; vectors[i] != NULL; ++i) {
        addTermFreqVector(vectors[i]);
    }
}
/**
 * Adds a single term vector to the open document as a complete field.
 *
 * @param v the term vector to write
 * Raises CL_ERR_InvalidState (through _CLTHROWA) when no document is open or
 * when a field is still open.
 */
void TermVectorsWriter::addTermFreqVector(TermFreqVector* v)
{
    if (!isDocumentOpen()) {
        _CLTHROWA(CL_ERR_InvalidState,"Cannot add term vector when document is not open");
    }
    if (isFieldOpen()) {
        _CLTHROWA(CL_ERR_InvalidState,"Cannot add term vector when field is open");
    }

    addTermFreqVectorInternal(v);
}
/**
 * Writes one term vector as a field: opens the vector's field, adds each
 * term with its frequency, and closes the field again. No state checks are
 * performed here.
 *
 * @param v the term vector to write
 */
void TermVectorsWriter::addTermFreqVectorInternal(TermFreqVector* v)
{
    openField(v->getField());

    // Local names are kept distinct from the 'terms' member used elsewhere.
    const TCHAR** termTexts = v->getTerms();
    const int32_t* termFreqs = v->getTermFrequencies();
    int32_t count = v->size();

    int32_t idx = 0;
    while (idx < count) {
        addTermInternal(termTexts[idx], termFreqs[idx]);
        ++idx;
    }

    // NOTE(review): only the pointer array is released here, not the strings
    // or the frequency array; presumably getTerms() hands ownership of the
    // array to the caller - confirm against the TermFreqVector implementations.
    _CLDELETE_ARRAY(termTexts);

    closeField();
}
/**
 * Closes the writer: finishes any open document, then attempts to close and
 * delete each of the three outputs (tvx, tvd, tvf).
 *
 * Error handling while closing an output (see the _DOTVWCLOSE helper macro
 * defined inside the function body):
 *  - a CLuceneError whose number() is not CL_ERR_IO is rethrown immediately;
 *  - a CLuceneError with number CL_ERR_IO is remembered (only the first
 *    number and the first message are kept) and the remaining outputs are
 *    still attempted;
 *  - any other exception is recorded with an "Unknown error while closing"
 *    message if no message was recorded yet; in that case ikeep stays 0.
 * After all three attempts, a recorded message is raised through
 * _CLTHROWA(ikeep, keep).
 *
 * NOTE(review): the closing code is passed as the argument of _CLFINALLY;
 * presumably that macro runs it whether or not closeDocument() throws -
 * confirm against the macro definition.
 * NOTE(review): messages are copied into a 200-character buffer with strcpy
 * and no length check.
 * NOTE(review): _DOTVWCLOSE is #defined inside this function and never
 * #undef'd, so it remains defined for the rest of the translation unit.
 */
void TermVectorsWriter::close() {
try {
closeDocument();
// make an effort to close all streams we can but remember and re-throw
// the first exception encountered in this process
#define _DOTVWCLOSE(x) if (x != NULL){ \
try { \
x->close(); _CLDELETE(x) \
} catch (CLuceneError& e) { \
if ( e.number() != CL_ERR_IO ) throw e; \
if (ikeep==0)ikeep=e.number(); \
if (keep[0]==0) strcpy(keep,e.what()); \
} catch (...) { \
if (keep[0]==0) strcpy(keep,"Unknown error while closing " #x); \
} \
}
}_CLFINALLY( \
char keep[200]; \
int32_t ikeep=0;
keep[0]=0; \
_DOTVWCLOSE(tvx); \
_DOTVWCLOSE(tvd); \
_DOTVWCLOSE(tvf); \
if (keep[0] != 0 ) { \
_CLTHROWA(ikeep,keep); \
}
);
}
void TermVectorsWriter::writeField() {
// remember where this field is written
currentField->tvfPointer = tvf->getFilePointer();
//System.out.println("Field Pointer: " + currentField.tvfPointer);
int32_t size;
tvf->writeVInt(size = terms.size());
tvf->writeVInt(currentField->length - size);
const TCHAR* lastTermText = LUCENE_BLANK_STRING;
int32_t lastTermTextLen = 0;
// write term ids and positions
for (int32_t i = 0; i < size; i++) {
TVTerm* term = (TVTerm*)terms[i];
//tvf->writeString(term->termText);
int32_t start = CL_NS(util)::Misc::stringDifference(lastTermText, lastTermTextLen,
term->getTermText(),term->getTermTextLen());
int32_t length = term->getTermTextLen() - start;
tvf->writeVInt(start); // write shared prefix length
tvf->writeVInt(length); // write delta length
tvf->writeChars(term->getTermText(), start, length); // write delta chars
tvf->writeVInt(term->freq);
lastTermText = term->getTermText();
lastTermTextLen = term->getTermTextLen();
}
}
void TermVectorsWriter::writeDoc() {
if (isFieldOpen()) _CLTHROWA(CL_ERR_InvalidState,"Field is still open while writing document");
//System.out.println("Writing doc pointer: " + currentDocPointer);
// write document index record
tvx->writeLong(currentDocPointer);
// write document data record
int32_t size = fields.size();
// write the number of fields
tvd->writeVInt(size);
// write field numbers
{ //msvc6 scope fix
for (int32_t i = 0; i < size; i++) {
TVField* field = (TVField*) fields[i];
tvd->writeVInt(field->number);
}
}
// write field pointers
int64_t lastFieldPointer = 0;
{ //msvc6 scope fix
for (int32_t i = 0; i < size; i++) {
TVField* field = (TVField*) fields[i];
tvd->writeVLong(field->tvfPointer - lastFieldPointer);
lastFieldPointer = field->tvfPointer;
}
}
//System.out.println("After writing doc pointer: " + tvx.getFilePointer());
}
/**
 * @return the stored term text; NULL until setTermText() has been called.
 *         The pointer remains owned by this term.
 */
const TCHAR* TermVectorsWriter::TVTerm::getTermText() const
{
    return this->termText;
}
/**
 * Returns the length of the term text, measuring it on first use and
 * caching the result until the text is replaced.
 *
 * @return length of the term text in characters
 */
size_t TermVectorsWriter::TVTerm::getTermTextLen()
{
    // -1 marks the cached length as not yet computed.
    if (termTextLen != -1)
        return termTextLen;

    termTextLen = _tcslen(termText);
    return termTextLen;
}
void TermVectorsWriter::TVTerm::setTermText(const TCHAR* val){
_CLDELETE_CARRAY(termText);
termText = STRDUP_TtoT(val);
termTextLen = -1;
}
/**
 * Creates an empty term: zero frequency, no text, and no cached length.
 */
TermVectorsWriter::TVTerm::TVTerm()
{
    freq = 0;
    termText = NULL;
    termTextLen = -1;   // -1 means the length has not been computed
}
/**
 * Releases the term's private copy of its text.
 */
TermVectorsWriter::TVTerm::~TVTerm()
{
    _CLDELETE_CARRAY(termText);
}
CL_NS_END
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -