📄 termvectorsreader.java
字号:
//We need to offset by
      tvx.seek((docNumber * 8L) + FORMAT_SIZE);
      long position = tvx.readLong();

      tvd.seek(position);
      int fieldCount = tvd.readVInt();

      // No fields are vectorized for this document
      if (fieldCount != 0) {
        int number = 0;
        String[] fields = new String[fieldCount];

        for (int i = 0; i < fieldCount; i++) {
          if(tvdFormat == FORMAT_VERSION)
            number = tvd.readVInt();
          else
            number += tvd.readVInt();

          fields[i] = fieldInfos.fieldName(number);
        }

        // Compute position in the tvf file
        position = 0;
        long[] tvfPointers = new long[fieldCount];
        for (int i = 0; i < fieldCount; i++) {
          position += tvd.readVLong();
          tvfPointers[i] = position;
        }

        mapper.setDocumentNumber(docNumber);
        readTermVectors(fields, tvfPointers, mapper);
      }
    } else {
      //System.out.println("No tvx file");
    }
  }

  /**
   * Reads one term vector per field and materializes each of them through a
   * fresh {@link ParallelArrayTermVectorMapper}.
   *
   * @param docNum The document number handed to every mapper
   * @param fields The names of the fields to read
   * @param tvfPointers For each field, the position in the tvf file where its data starts
   * @return One vector per field, in the order of <code>fields</code>; an entry is
   *         <code>null</code> when the mapper had nothing to materialize
   * @throws IOException If reading from the tvf file fails
   */
  private SegmentTermVector[] readTermVectors(int docNum, String fields[], long tvfPointers[])
          throws IOException {
    SegmentTermVector res[] = new SegmentTermVector[fields.length];
    for (int i = 0; i < fields.length; i++) {
      // A new mapper per field: each one collects exactly one vector.
      ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
      mapper.setDocumentNumber(docNum);
      readTermVector(fields[i], tvfPointers[i], mapper);
      res[i] = (SegmentTermVector) mapper.materializeVector();
    }
    return res;
  }

  /**
   * Feeds the term vector of every given field to a single, caller-supplied mapper.
   *
   * @param fields The names of the fields to read
   * @param tvfPointers For each field, the position in the tvf file where its data starts
   * @param mapper The mapper that receives the data of all fields
   * @throws IOException If reading from the tvf file fails
   */
  private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
          throws IOException {
    for (int i = 0; i < fields.length; i++) {
      readTermVector(fields[i], tvfPointers[i], mapper);
    }
  }

  /**
   * Reads the term vector stored at <code>tvfPointer</code> and reports it to the
   * mapper: one {@link TermVectorMapper#setExpectations} call followed by one
   * {@link TermVectorMapper#map} call per term. Nothing is returned; when the
   * stored term count is zero the mapper is not called at all.
   *
   * @param field The field to read in
   * @param tvfPointer The pointer within the tvf file where we should start reading
   * @param mapper The mapper used to map the TermVector
   * @throws IOException If reading from the tvf file fails
   */
  private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
          throws IOException {

    // Now read the data from specified position
    // We don't need to offset by the FORMAT here since the pointer already includes the offset
    tvf.seek(tvfPointer);

    int numTerms = tvf.readVInt();
    // No terms: there is nothing to report to the mapper. This should never occur!
    if (numTerms == 0)
      return;

    boolean storePositions;
    boolean storeOffsets;

    if (tvfFormat == FORMAT_VERSION) {
      // Current format: one flag byte says whether positions and/or offsets follow each term.
      byte bits = tvf.readByte();
      storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
      storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
    } else {
      // Other formats: one VInt is read and discarded, and neither positions nor offsets
      // are expected. NOTE(review): the meaning of the skipped VInt is not visible here -
      // confirm against the file format description.
      tvf.readVInt();
      storePositions = false;
      storeOffsets = false;
    }
    mapper.setExpectations(field, numTerms, storeOffsets, storePositions);

    char[] buffer = new char[10];    // init the buffer with a length of 10 character
    char[] previousBuffer = {};

    for (int i = 0; i < numTerms; i++) {
      // Each term is stored as the number of leading chars shared with the previous
      // term (start) followed by the remaining chars (deltaLength).
      int start = tvf.readVInt();
      int deltaLength = tvf.readVInt();
      int totalLength = start + deltaLength;
      if (buffer.length < totalLength) {  // increase buffer
        buffer = new char[totalLength];

        if (start > 0)  // carry the shared prefix over into the new buffer
          System.arraycopy(previousBuffer, 0, buffer, 0, start);
      }

      tvf.readChars(buffer, start, deltaLength);
      String term = new String(buffer, 0, totalLength);
      previousBuffer = buffer;
      int freq = tvf.readVInt();

      int[] positions = null;
      if (storePositions) {
        // does the mapper even care about positions?
        if (!mapper.isIgnoringPositions()) {
          // Positions are stored as deltas from the previous position.
          positions = new int[freq];
          int prevPosition = 0;
          for (int j = 0; j < freq; j++) {
            positions[j] = prevPosition + tvf.readVInt();
            prevPosition = positions[j];
          }
        } else {
          // We need to skip over the positions. Since these are VInts there is no
          // way to know how far to skip without reading each of them.
          for (int j = 0; j < freq; j++) {
            tvf.readVInt();
          }
        }
      }

      TermVectorOffsetInfo[] offsets = null;
      if (storeOffsets) {
        // does the mapper even care about offsets?
        if (!mapper.isIgnoringOffsets()) {
          // Start offsets are deltas from the previous end offset;
          // end offsets are deltas from their own start offset.
          offsets = new TermVectorOffsetInfo[freq];
          int prevOffset = 0;
          for (int j = 0; j < freq; j++) {
            int startOffset = prevOffset + tvf.readVInt();
            int endOffset = startOffset + tvf.readVInt();
            offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
            prevOffset = endOffset;
          }
        } else {
          // Skip both VInts of every offset pair.
          for (int j = 0; j < freq; j++) {
            tvf.readVInt();
            tvf.readVInt();
          }
        }
      }
      mapper.map(term, freq, offsets, positions);
    }
  }

  /**
   * Creates a copy of this reader that works on its own clones of the tvx, tvd
   * and tvf inputs.
   *
   * @return The cloned reader, or <code>null</code> if any of the three inputs is
   *         <code>null</code>
   * @throws RuntimeException If <code>super.clone()</code> reports that cloning is
   *         not supported; the original exception is kept as the cause
   */
  protected Object clone() {

    if (tvx == null || tvd == null || tvf == null)
      return null;

    TermVectorsReader clone;
    try {
      clone = (TermVectorsReader) super.clone();
    } catch (CloneNotSupportedException e) {
      // Previously swallowed, which led to a NullPointerException on the next line.
      // Fail with the real cause instead.
      throw new RuntimeException("TermVectorsReader could not be cloned", e);
    }

    clone.tvx = (IndexInput) tvx.clone();
    clone.tvd = (IndexInput) tvd.clone();
    clone.tvf = (IndexInput) tvf.clone();

    return clone;
  }
}

/**
 * Models the existing parallel array structure: terms, frequencies and, when
 * stored, positions and offsets are collected in arrays that share one index per term.
 */
class ParallelArrayTermVectorMapper extends TermVectorMapper
{

  private String[] terms;
  private int[] termFreqs;
  private int positions[][];
  private TermVectorOffsetInfo offsets[][];
  // index of the next free slot in the parallel arrays
  private int currentPosition;
  private boolean storingOffsets;
  private boolean storingPositions;
  private String field;

  /**
   * Prepares the mapper for the terms of one field. All state collected for a
   * previous field is discarded, so the same instance can be used again.
   *
   * @param field The name of the field whose terms will follow
   * @param numTerms The number of terms that will be mapped
   * @param storeOffsets <code>true</code> if offsets are stored for the field
   * @param storePositions <code>true</code> if positions are stored for the field
   */
  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
    this.field = field;
    this.terms = new String[numTerms];
    this.termFreqs = new int[numTerms];
    this.storingOffsets = storeOffsets;
    this.storingPositions = storePositions;
    // Always reassign (possibly to null) so that arrays of a previous field cannot leak through.
    this.positions = storePositions ? new int[numTerms][] : null;
    this.offsets = storeOffsets ? new TermVectorOffsetInfo[numTerms][] : null;
    // Start filling the fresh arrays from the first slot.
    this.currentPosition = 0;
  }

  /**
   * Records one term in the next free slot of the parallel arrays.
   *
   * @param term The term text
   * @param frequency The frequency of the term
   * @param offsets The offsets of the term; only kept when offsets are stored for the field
   * @param positions The positions of the term; only kept when positions are stored for the field
   */
  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
    terms[currentPosition] = term;
    termFreqs[currentPosition] = frequency;
    if (storingOffsets) {
      this.offsets[currentPosition] = offsets;
    }
    if (storingPositions) {
      this.positions[currentPosition] = positions;
    }
    currentPosition++;
  }

  /**
   * Construct the vector
   * @return The {@link TermFreqVector} based on the mappings, or <code>null</code>
   *         if no field has been set up through {@link #setExpectations}
   */
  public TermFreqVector materializeVector() {
    SegmentTermVector tv = null;
    if (field != null && terms != null) {
      if (storingPositions || storingOffsets) {
        tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
      } else {
        tv = new SegmentTermVector(field, terms, termFreqs);
      }
    }
    return tv;
  }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -