📄 termvectorsreader.java

📁 lucene完整源码
💻 JAVA
字号:
package org.apache.lucene.index;/** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import org.apache.lucene.store.Directory;import org.apache.lucene.store.IndexInput;import java.io.IOException;/** * @version $Id: TermVectorsReader.java 170226 2005-05-15 15:04:39Z bmesser $ */class TermVectorsReader implements Cloneable {  private FieldInfos fieldInfos;  private IndexInput tvx;  private IndexInput tvd;  private IndexInput tvf;  private int size;    private int tvdFormat;  private int tvfFormat;  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)    throws IOException {    if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {      tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION);      checkValidFormat(tvx);      tvd = d.openInput(segment + TermVectorsWriter.TVD_EXTENSION);      tvdFormat = checkValidFormat(tvd);      tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION);      tvfFormat = checkValidFormat(tvf);      size = (int) tvx.length() / 8;    }    this.fieldInfos = fieldInfos;  }    private int checkValidFormat(IndexInput in) throws IOException  {    int format = in.readInt();    if (format > TermVectorsWriter.FORMAT_VERSION)    {      throw new IOException("Incompatible format version: " + format + " expected "               + TermVectorsWriter.FORMAT_VERSION + " or less");    }    return format;  }  void close() throws IOException {  	// make all effort to close up. Keep the first exception  	// and throw it as a new one.  	IOException keep = null;  	if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }  	if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }  	if (tvf  != null) try {  tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }  	if (keep != null) throw (IOException) keep.fillInStackTrace();  }  /**   *    * @return The number of documents in the reader   */  int size() {    return size;  }  /**   * Retrieve the term vector for the given document and field   * @param docNum The document number to retrieve the vector for   * @param field The field within the document to retrieve   * @return The TermFreqVector for the document and field or null if there is no termVector for this field.   * @throws IOException if there is an error reading the term vector files   */   TermFreqVector get(int docNum, String field) throws IOException {    // Check if no term vectors are available for this segment at all    int fieldNumber = fieldInfos.fieldNumber(field);    TermFreqVector result = null;    if (tvx != null) {      //We need to account for the FORMAT_SIZE at when seeking in the tvx      //We don't need to do this in other seeks because we already have the      // file pointer      //that was written in another file      tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);      //System.out.println("TVX Pointer: " + tvx.getFilePointer());      long position = tvx.readLong();      tvd.seek(position);      int fieldCount = tvd.readVInt();      //System.out.println("Num Fields: " + fieldCount);      // There are only a few fields per document. We opt for a full scan      // rather then requiring that they be ordered. We need to read through      // all of the fields anyway to get to the tvf pointers.      int number = 0;      int found = -1;      for (int i = 0; i < fieldCount; i++) {        if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)          number = tvd.readVInt();        else          number += tvd.readVInt();                if (number == fieldNumber)          found = i;      }      // This field, although valid in the segment, was not found in this      // document      if (found != -1) {        // Compute position in the tvf file        position = 0;        for (int i = 0; i <= found; i++)          position += tvd.readVLong();        result = readTermVector(field, position);      } else {        //System.out.println("Field not found");      }    } else {      //System.out.println("No tvx file");    }    return result;  }  /**   * Return all term vectors stored for this document or null if the could not be read in.   *    * @param docNum The document number to retrieve the vector for   * @return All term frequency vectors   * @throws IOException if there is an error reading the term vector files    */  TermFreqVector[] get(int docNum) throws IOException {    TermFreqVector[] result = null;    // Check if no term vectors are available for this segment at all    if (tvx != null) {      //We need to offset by      tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);      long position = tvx.readLong();      tvd.seek(position);      int fieldCount = tvd.readVInt();      // No fields are vectorized for this document      if (fieldCount != 0) {        int number = 0;        String[] fields = new String[fieldCount];                for (int i = 0; i < fieldCount; i++) {          if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)            number = tvd.readVInt();          else            number += tvd.readVInt();          fields[i] = fieldInfos.fieldName(number);        }        // Compute position in the tvf file        position = 0;        long[] tvfPointers = new long[fieldCount];        for (int i = 0; i < fieldCount; i++) {          position += tvd.readVLong();          tvfPointers[i] = position;        }        result = readTermVectors(fields, tvfPointers);      }    } else {      //System.out.println("No tvx file");    }    return result;  }  private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[])          throws IOException {    SegmentTermVector res[] = new SegmentTermVector[fields.length];    for (int i = 0; i < fields.length; i++) {      res[i] = readTermVector(fields[i], tvfPointers[i]);    }    return res;  }  /**   *    * @param field The field to read in   * @param tvfPointer The pointer within the tvf file where we should start reading   * @return The TermVector located at that position   * @throws IOException   */   private SegmentTermVector readTermVector(String field, long tvfPointer)          throws IOException {    // Now read the data from specified position    //We don't need to offset by the FORMAT here since the pointer already includes the offset    tvf.seek(tvfPointer);    int numTerms = tvf.readVInt();    //System.out.println("Num Terms: " + numTerms);    // If no terms - return a constant empty termvector. However, this should never occur!    if (numTerms == 0)       return new SegmentTermVector(field, null, null);        boolean storePositions;    boolean storeOffsets;        if(tvfFormat == TermVectorsWriter.FORMAT_VERSION){      byte bits = tvf.readByte();      storePositions = (bits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;      storeOffsets = (bits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;    }    else{      tvf.readVInt();      storePositions = false;      storeOffsets = false;    }    String terms[] = new String[numTerms];    int termFreqs[] = new int[numTerms];        //  we may not need these, but declare them    int positions[][] = null;    TermVectorOffsetInfo offsets[][] = null;    if(storePositions)      positions = new int[numTerms][];    if(storeOffsets)      offsets = new TermVectorOffsetInfo[numTerms][];        int start = 0;    int deltaLength = 0;    int totalLength = 0;    char [] buffer = new char[10];    // init the buffer with a length of 10 character    char[] previousBuffer = {};        for (int i = 0; i < numTerms; i++) {      start = tvf.readVInt();      deltaLength = tvf.readVInt();      totalLength = start + deltaLength;      if (buffer.length < totalLength) {  // increase buffer        buffer = null;    // give a hint to garbage collector        buffer = new char[totalLength];                if (start > 0)  // just copy if necessary          System.arraycopy(previousBuffer, 0, buffer, 0, start);      }            tvf.readChars(buffer, start, deltaLength);      terms[i] = new String(buffer, 0, totalLength);      previousBuffer = buffer;      int freq = tvf.readVInt();      termFreqs[i] = freq;            if (storePositions) { //read in the positions        int [] pos = new int[freq];        positions[i] = pos;        int prevPosition = 0;        for (int j = 0; j < freq; j++)        {          pos[j] = prevPosition + tvf.readVInt();          prevPosition = pos[j];        }      }            if (storeOffsets) {        TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];        offsets[i] = offs;        int prevOffset = 0;        for (int j = 0; j < freq; j++) {          int startOffset = prevOffset + tvf.readVInt();          int endOffset = startOffset + tvf.readVInt();          offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);          prevOffset = endOffset;        }      }    }        SegmentTermVector tv;    if (storePositions || storeOffsets){      tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);    }    else {      tv = new SegmentTermVector(field, terms, termFreqs);    }    return tv;  }  protected Object clone() {        if (tvx == null || tvd == null || tvf == null)      return null;        TermVectorsReader clone = null;    try {      clone = (TermVectorsReader) super.clone();    } catch (CloneNotSupportedException e) {}    clone.tvx = (IndexInput) tvx.clone();    clone.tvd = (IndexInput) tvd.clone();    clone.tvf = (IndexInput) tvf.clone();        return clone;  }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -