📄 termvectorsreader.java

📁 lucene-2.4.0 是一个全文搜索的工具包
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;

import java.io.IOException;
import java.util.Arrays;

/**
 * Reads the term vectors of one segment from three files opened through a
 * {@link Directory}: the vectors index file (<code>tvx</code>), the vectors
 * documents file (<code>tvd</code>) and the vectors fields file
 * (<code>tvf</code>).
 *
 * <p>The index file starts with a {@link #FORMAT_SIZE}-byte format header that
 * is followed by one fixed-size entry per document: 16 bytes for formats
 * &gt;= {@link #FORMAT_VERSION2} (a tvd pointer followed by a tvf pointer),
 * 8 bytes for older formats (a tvd pointer only).
 *
 * <p>If the segment has no vectors index file, no stream is opened and the
 * lookup methods in this class become no-ops.
 *
 * @version $Id: TermVectorsReader.java 687046 2008-08-19 13:01:11Z mikemccand $
 */
class TermVectorsReader implements Cloneable {

  // NOTE: if you make a new format, it must be larger than
  // the current format
  static final int FORMAT_VERSION = 2;

  // Changes to speed up bulk merging of term vectors:
  static final int FORMAT_VERSION2 = 3;

  // Changed strings to UTF8 with length-in-bytes not length-in-chars
  static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;

  // NOTE: always change this if you switch to a new format!
  static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;

  // The size in bytes that the format version takes up at the beginning of each file
  static final int FORMAT_SIZE = 4;

  // Bit flags; their consumers are not visible in this part of the file.
  static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
  static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
  
  // Used to translate field names into field numbers.
  private FieldInfos fieldInfos;

  // Index, documents and fields streams. All three stay null when the
  // segment has no vectors index file.
  private IndexInput tvx;
  private IndexInput tvd;
  private IndexInput tvf;

  // Number of documents exposed by this reader (see size()).
  private int size;

  // Number of document entries in the tvx file, derived from its length.
  private int numTotalDocs;

  // The docID offset where our docs begin in the index
  // file.  This will be 0 if we have our own private file.
  private int docStoreOffset;
  
  // Format version read from the tvx header, or 0 when there is no tvx file.
  private final int format;

  /**
   * Opens the term vector files of <code>segment</code> using
   * {@link BufferedIndexInput#BUFFER_SIZE} as the read buffer size.
   *
   * @param d directory holding the segment files
   * @param segment segment name, used as the file name prefix
   * @param fieldInfos field metadata used to resolve field numbers
   * @throws CorruptIndexException if a file declares a format newer than {@link #FORMAT_CURRENT}
   * @throws IOException if a file cannot be opened or read
   */
  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
    throws CorruptIndexException, IOException {
    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
  }

  /**
   * Opens the term vector files of <code>segment</code> with an explicit read
   * buffer size, treating the files as private to this segment
   * (doc store offset of -1).
   *
   * @param d directory holding the segment files
   * @param segment segment name, used as the file name prefix
   * @param fieldInfos field metadata used to resolve field numbers
   * @param readBufferSize buffer size passed to {@link Directory#openInput}
   * @throws CorruptIndexException if a file declares a format newer than {@link #FORMAT_CURRENT}
   * @throws IOException if a file cannot be opened or read
   */
  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
    throws CorruptIndexException, IOException {
    this(d, segment, fieldInfos, readBufferSize, -1, 0);
  }
    
  /**
   * Opens the term vector files of <code>segment</code>.
   *
   * <p>If the vectors index file does not exist, no stream is opened and the
   * format is recorded as 0. If anything fails part-way through, every stream
   * opened so far is closed before the exception propagates.
   *
   * @param d directory holding the segment files
   * @param segment segment name, used as the file name prefix
   * @param fieldInfos field metadata used to resolve field numbers
   * @param readBufferSize buffer size passed to {@link Directory#openInput}
   * @param docStoreOffset docID offset at which this reader's documents begin
   *        in the files, or -1 to start at 0 and expose every document in the
   *        index file
   * @param size number of documents exposed by this reader; only stored when
   *        <code>docStoreOffset</code> is not -1
   * @throws CorruptIndexException if a file declares a format newer than {@link #FORMAT_CURRENT}
   * @throws IOException if a file cannot be opened or read
   */
  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
    throws CorruptIndexException, IOException {
    boolean success = false;

    try {
      if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
        tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
        format = checkValidFormat(tvx);
        tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
        final int tvdFormat = checkValidFormat(tvd);
        tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
        final int tvfFormat = checkValidFormat(tvf);

        // All three files are expected to carry the same format version.
        assert format == tvdFormat;
        assert format == tvfFormat;

        // Derive the entry count from the file length. The shift also
        // discards the FORMAT_SIZE header bytes, because the header is
        // smaller than a single entry.
        if (format >= FORMAT_VERSION2) {
          assert (tvx.length()-FORMAT_SIZE) % 16 == 0;
          numTotalDocs = (int) (tvx.length() >> 4);
        } else {
          assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
          numTotalDocs = (int) (tvx.length() >> 3);
        }

        if (-1 == docStoreOffset) {
          this.docStoreOffset = 0;
          this.size = numTotalDocs;
          // Unqualified "size" is the constructor argument (it shadows the
          // field): it must be 0 or agree with the file.
          assert size == 0 || numTotalDocs == size;
        } else {
          this.docStoreOffset = docStoreOffset;
          this.size = size;
          // Verify the file is long enough to hold all of our
          // docs
          assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
        }
      } else
        format = 0;

      this.fieldInfos = fieldInfos;
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above. In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  // Used for bulk copy when merging
  IndexInput getTvdStream() {
    return tvd;
  }

  // Used for bulk copy when merging
  IndexInput getTvfStream() {
    return tvf;
  }

  /**
   * Positions <code>tvx</code> at the index entry of <code>docNum</code>,
   * applying <code>docStoreOffset</code> and skipping the
   * {@link #FORMAT_SIZE}-byte header. Entries are 8 bytes wide before
   * {@link #FORMAT_VERSION2} and 16 bytes wide from it onwards.
   *
   * @param docNum document number relative to this reader
   * @throws IOException if the seek fails
   */
  final private void seekTvx(final int docNum) throws IOException {
    // The long literals force 64-bit arithmetic for the byte offset.
    if (format < FORMAT_VERSION2)
      tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
    else
      tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
  }

  /**
   * @return true if the files were written with format
   *         {@link #FORMAT_UTF8_LENGTH_IN_BYTES} or newer
   */
  boolean canReadRawDocs() {
    return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
  }

  /** Retrieve the length (in bytes) of the tvd and tvf
   *  entries for the next numDocs starting with
   *  startDocID.  This is used for bulk copying when
   *  merging segments, if the field numbers are
   *  congruent.  Once this returns, the tvf & tvd streams
   *  are seeked to the startDocID.
   *
   *  <p>If this reader has no index stream, both arrays are
   *  filled with zeros and nothing else happens.
   *
   *  @param tvdLengths receives, per document, the byte length of its tvd entry
   *  @param tvfLengths receives, per document, the byte length of its tvf entry
   *  @param startDocID first document, relative to this reader
   *  @param numDocs number of consecutive documents to measure
   *  @throws IllegalStateException if the format is older than {@link #FORMAT_VERSION2}
   *  @throws IOException if reading the index file fails */
  final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {

    if (tvx == null) {
      Arrays.fill(tvdLengths, 0);
      Arrays.fill(tvfLengths, 0);
      return;
    }

    // SegmentMerger calls canReadRawDocs() first and should
    // not call us if that returns false.
    // NOTE(review): this guard tests FORMAT_VERSION2 while canReadRawDocs()
    // tests FORMAT_UTF8_LENGTH_IN_BYTES -- confirm the difference is intended.
    if (format < FORMAT_VERSION2)
      throw new IllegalStateException("cannot read raw docs with older term vector formats");

    seekTvx(startDocID);

    // Each index entry holds the tvd pointer followed by the tvf pointer.
    long tvdPosition = tvx.readLong();
    tvd.seek(tvdPosition);

    long tvfPosition = tvx.readLong();
    tvf.seek(tvfPosition);

    long lastTvdPosition = tvdPosition;
    long lastTvfPosition = tvfPosition;

    int count = 0;
    while (count < numDocs) {
      // Index of the entry FOLLOWING the current document; the current
      // document's data ends where that entry's data begins.
      final int docID = docStoreOffset + startDocID + count + 1;
      assert docID <= numTotalDocs;
      if (docID < numTotalDocs)  {
        tvdPosition = tvx.readLong();
        tvfPosition = tvx.readLong();
      } else {
        // No following entry: the data of the last document runs to the
        // end of each file.
        tvdPosition = tvd.length();
        tvfPosition = tvf.length();
        assert count == numDocs-1;
      }
      tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
      tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
      count++;
      lastTvdPosition = tvdPosition;
      lastTvfPosition = tvfPosition;
    }
  }

  /**
   * Reads the format version int at the current position of <code>in</code>
   * and rejects versions newer than {@link #FORMAT_CURRENT}.
   *
   * @param in stream positioned at its format header
   * @return the format version that was read
   * @throws CorruptIndexException if the version is greater than {@link #FORMAT_CURRENT}
   * @throws IOException if the read fails
   */
  private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException  {
    int format = in.readInt();
    if (format > FORMAT_CURRENT) {
      throw new CorruptIndexException("Incompatible format version: " + format + " expected " 
                                      + FORMAT_CURRENT + " or less");
    }
    return format;
  }

  /**
   * Closes whichever of the three streams are open. Every stream is attempted
   * even if an earlier close fails; the first failure is rethrown at the end.
   *
   * @throws IOException the first exception raised while closing, if any
   */
  void close() throws IOException {
    // make all effort to close up. Keep the first exception
    // and throw it as a new one.
    IOException keep = null;
    if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
    if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
    if (tvf  != null) try {  tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
    // NOTE(review): fillInStackTrace() replaces the recorded stack trace with
    // the current one, so the location of the original failure is lost --
    // confirm this is wanted.
    if (keep != null) throw (IOException) keep.fillInStackTrace();
  }

  /**
   * 
   * @return The number of documents in the reader
   */
  int size() {
    return size;
  }

  /**
   * Looks up the term vector of <code>field</code> in document
   * <code>docNum</code> and, if that document has one, sets the document
   * number on <code>mapper</code> and passes the mapper on to
   * <code>readTermVector</code>. Does nothing when this reader has no index
   * stream or the field is not listed for the document.
   *
   * @param docNum document number relative to this reader
   * @param field name of the field whose vector is wanted
   * @param mapper receives the document number and is passed to <code>readTermVector</code>
   * @throws IOException if reading any of the term vector files fails
   */
  public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
    if (tvx != null) {
      int fieldNumber = fieldInfos.fieldNumber(field);
      // We need to account for FORMAT_SIZE when seeking in the tvx file.
      // The other seeks do not need this because they use absolute file
      // pointers that were read from another file.
      seekTvx(docNum);
      //System.out.println("TVX Pointer: " + tvx.getFilePointer());
      long tvdPosition = tvx.readLong();

      tvd.seek(tvdPosition);
      int fieldCount = tvd.readVInt();
      //System.out.println("Num Fields: " + fieldCount);
      // There are only a few fields per document. We opt for a full scan
      // rather than requiring that they be ordered. We need to read through
      // all of the fields anyway to get to the tvf pointers.
      int number = 0;
      int found = -1;
      for (int i = 0; i < fieldCount; i++) {
        // From FORMAT_VERSION on each value is used as the field number
        // directly; for older formats the values are accumulated.
        if (format >= FORMAT_VERSION)
          number = tvd.readVInt();
        else
          number += tvd.readVInt();

        if (number == fieldNumber)
          found = i;
      }

      // found stays -1 when the field, although valid in the segment, is not
      // listed for this document.
      if (found != -1) {
        // Compute position in the tvf file
        long position;
        // From FORMAT_VERSION2 on the starting tvf pointer is the second long
        // of the tvx entry; older formats store it in tvd.
        if (format >= FORMAT_VERSION2)
          position = tvx.readLong();
        else
          position = tvd.readVLong();
        // Add one further VLong per preceding field to reach the wanted one.
        for (int i = 1; i <= found; i++)
          position += tvd.readVLong();

        mapper.setDocumentNumber(docNum);
        readTermVector(field, position, mapper);
      } else {
        //System.out.println("Fieldable not found");
      }
    } else {
      //System.out.println("No tvx file");
    }
  }

  /**
   * Retrieve the term vector for the given document and field
   * @param docNum The document number to retrieve the vector for
   * @param field The field within the document to retrieve
   * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
   * @throws IOException if there is an error reading the term vector files
   */ 
  TermFreqVector get(int docNum, String field) throws IOException {
    // No availability check is made here: the mapper-based get() does
    // nothing when the segment has no term vectors, and the result is
    // whatever the mapper materializes afterwards.
    ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
    get(docNum, field, mapper);

    return mapper.materializeVector();
  }

  // Reads the String[] fields; you have to pre-seek tvd to
  // the right point
  final private String[] readFields(int fieldCount) throws IOException {
    int number = 0;
    String[] fields = new String[fieldCount];

    for (int i = 0; i < fieldCount; i++) {
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -