📄 termvectorsreader.java
字号:
if (format >= FORMAT_VERSION) number = tvd.readVInt(); else number += tvd.readVInt(); fields[i] = fieldInfos.fieldName(number); } return fields; } // Reads the long[] offsets into TVF; you have to pre-seek // tvx/tvd to the right point final private long[] readTvfPointers(int fieldCount) throws IOException { // Compute position in the tvf file long position; if (format >= FORMAT_VERSION2) position = tvx.readLong(); else position = tvd.readVLong(); long[] tvfPointers = new long[fieldCount]; tvfPointers[0] = position; for (int i = 1; i < fieldCount; i++) { position += tvd.readVLong(); tvfPointers[i] = position; } return tvfPointers; } /** * Return all term vectors stored for this document or null if the could not be read in. * * @param docNum The document number to retrieve the vector for * @return All term frequency vectors * @throws IOException if there is an error reading the term vector files */ TermFreqVector[] get(int docNum) throws IOException { TermFreqVector[] result = null; if (tvx != null) { //We need to offset by seekTvx(docNum); long tvdPosition = tvx.readLong(); tvd.seek(tvdPosition); int fieldCount = tvd.readVInt(); // No fields are vectorized for this document if (fieldCount != 0) { final String[] fields = readFields(fieldCount); final long[] tvfPointers = readTvfPointers(fieldCount); result = readTermVectors(docNum, fields, tvfPointers); } } else { //System.out.println("No tvx file"); } return result; } public void get(int docNumber, TermVectorMapper mapper) throws IOException { // Check if no term vectors are available for this segment at all if (tvx != null) { //We need to offset by seekTvx(docNumber); long tvdPosition = tvx.readLong(); tvd.seek(tvdPosition); int fieldCount = tvd.readVInt(); // No fields are vectorized for this document if (fieldCount != 0) { final String[] fields = readFields(fieldCount); final long[] tvfPointers = readTvfPointers(fieldCount); mapper.setDocumentNumber(docNumber); readTermVectors(fields, tvfPointers, mapper); } } else { //System.out.println("No tvx file"); } } private SegmentTermVector[] readTermVectors(int docNum, String fields[], long tvfPointers[]) throws IOException { SegmentTermVector res[] = new SegmentTermVector[fields.length]; for (int i = 0; i < fields.length; i++) { ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper(); mapper.setDocumentNumber(docNum); readTermVector(fields[i], tvfPointers[i], mapper); res[i] = (SegmentTermVector) mapper.materializeVector(); } return res; } private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper) throws IOException { for (int i = 0; i < fields.length; i++) { readTermVector(fields[i], tvfPointers[i], mapper); } } /** * * @param field The field to read in * @param tvfPointer The pointer within the tvf file where we should start reading * @param mapper The mapper used to map the TermVector * @throws IOException */ private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper) throws IOException { // Now read the data from specified position //We don't need to offset by the FORMAT here since the pointer already includes the offset tvf.seek(tvfPointer); int numTerms = tvf.readVInt(); //System.out.println("Num Terms: " + numTerms); // If no terms - return a constant empty termvector. However, this should never occur! if (numTerms == 0) return; boolean storePositions; boolean storeOffsets; if (format >= FORMAT_VERSION){ byte bits = tvf.readByte(); storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; } else{ tvf.readVInt(); storePositions = false; storeOffsets = false; } mapper.setExpectations(field, numTerms, storeOffsets, storePositions); int start = 0; int deltaLength = 0; int totalLength = 0; byte[] byteBuffer; char[] charBuffer; final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES; // init the buffers if (preUTF8) { charBuffer = new char[10]; byteBuffer = null; } else { charBuffer = null; byteBuffer = new byte[20]; } for (int i = 0; i < numTerms; i++) { start = tvf.readVInt(); deltaLength = tvf.readVInt(); totalLength = start + deltaLength; final String term; if (preUTF8) { // Term stored as java chars if (charBuffer.length < totalLength) { char[] newCharBuffer = new char[(int) (1.5*totalLength)]; System.arraycopy(charBuffer, 0, newCharBuffer, 0, start); charBuffer = newCharBuffer; } tvf.readChars(charBuffer, start, deltaLength); term = new String(charBuffer, 0, totalLength); } else { // Term stored as utf8 bytes if (byteBuffer.length < totalLength) { byte[] newByteBuffer = new byte[(int) (1.5*totalLength)]; System.arraycopy(byteBuffer, 0, newByteBuffer, 0, start); byteBuffer = newByteBuffer; } tvf.readBytes(byteBuffer, start, deltaLength); term = new String(byteBuffer, 0, totalLength, "UTF-8"); } int freq = tvf.readVInt(); int [] positions = null; if (storePositions) { //read in the positions //does the mapper even care about positions? if (mapper.isIgnoringPositions() == false) { positions = new int[freq]; int prevPosition = 0; for (int j = 0; j < freq; j++) { positions[j] = prevPosition + tvf.readVInt(); prevPosition = positions[j]; } } else { //we need to skip over the positions. Since these are VInts, I don't believe there is anyway to know for sure how far to skip // for (int j = 0; j < freq; j++) { tvf.readVInt(); } } } TermVectorOffsetInfo[] offsets = null; if (storeOffsets) { //does the mapper even care about offsets? if (mapper.isIgnoringOffsets() == false) { offsets = new TermVectorOffsetInfo[freq]; int prevOffset = 0; for (int j = 0; j < freq; j++) { int startOffset = prevOffset + tvf.readVInt(); int endOffset = startOffset + tvf.readVInt(); offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset); prevOffset = endOffset; } } else { for (int j = 0; j < freq; j++){ tvf.readVInt(); tvf.readVInt(); } } } mapper.map(term, freq, offsets, positions); } } protected Object clone() throws CloneNotSupportedException { final TermVectorsReader clone = (TermVectorsReader) super.clone(); // These are null when a TermVectorsReader was created // on a segment that did not have term vectors saved if (tvx != null && tvd != null && tvf != null) { clone.tvx = (IndexInput) tvx.clone(); clone.tvd = (IndexInput) tvd.clone(); clone.tvf = (IndexInput) tvf.clone(); } return clone; }}/** * Models the existing parallel array structure */class ParallelArrayTermVectorMapper extends TermVectorMapper{ private String[] terms; private int[] termFreqs; private int positions[][]; private TermVectorOffsetInfo offsets[][]; private int currentPosition; private boolean storingOffsets; private boolean storingPositions; private String field; public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) { this.field = field; terms = new String[numTerms]; termFreqs = new int[numTerms]; this.storingOffsets = storeOffsets; this.storingPositions = storePositions; if(storePositions) this.positions = new int[numTerms][]; if(storeOffsets) this.offsets = new TermVectorOffsetInfo[numTerms][]; } public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { terms[currentPosition] = term; termFreqs[currentPosition] = frequency; if (storingOffsets) { this.offsets[currentPosition] = offsets; } if (storingPositions) { this.positions[currentPosition] = positions; } currentPosition++; } /** * Construct the vector * @return The {@link TermFreqVector} based on the mappings. */ public TermFreqVector materializeVector() { SegmentTermVector tv = null; if (field != null && terms != null) { if (storingPositions || storingOffsets) { tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets); } else { tv = new SegmentTermVector(field, terms, termFreqs); } } return tv; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -