📄 fieldsreader.java

📁 lucene-2.4.0 是一个全文搜索的工具包
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package org.apache.lucene.index;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.document.*;import org.apache.lucene.store.Directory;import org.apache.lucene.store.IndexInput;import org.apache.lucene.store.AlreadyClosedException;import org.apache.lucene.store.BufferedIndexInput;import org.apache.lucene.util.CloseableThreadLocal;import java.io.ByteArrayOutputStream;import java.io.IOException;import java.io.Reader;import java.util.zip.DataFormatException;import java.util.zip.Inflater;/** * Class responsible for access to stored document fields. * <p/> * It uses &lt;segment&gt;.fdt and &lt;segment&gt;.fdx; files. * * @version $Id: FieldsReader.java 695184 2008-09-14 10:32:59Z mikemccand $ */final class FieldsReader {  private final FieldInfos fieldInfos;  // The main fieldStream, used only for cloning.  private final IndexInput cloneableFieldsStream;  // This is a clone of cloneableFieldsStream used for reading documents.  // It should not be cloned outside of a synchronized context.  
private final IndexInput fieldsStream;  private final IndexInput indexStream;  private int numTotalDocs;  private int size;  private boolean closed;  private final int format;  private final int formatSize;  // The docID offset where our docs begin in the index  // file.  This will be 0 if we have our own private file.  private int docStoreOffset;  private CloseableThreadLocal fieldsStreamTL = new CloseableThreadLocal();  FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {    this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0);  }  FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException {    this(d, segment, fn, readBufferSize, -1, 0);  }  FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException {    boolean success = false;    try {      fieldInfos = fn;      cloneableFieldsStream = d.openInput(segment + "." + IndexFileNames.FIELDS_EXTENSION, readBufferSize);      indexStream = d.openInput(segment + "." 
+ IndexFileNames.FIELDS_INDEX_EXTENSION, readBufferSize);      // First version of fdx did not include a format      // header, but, the first int will always be 0 in that      // case      int firstInt = indexStream.readInt();      if (firstInt == 0)        format = 0;      else        format = firstInt;      if (format > FieldsWriter.FORMAT_CURRENT)        throw new CorruptIndexException("Incompatible format version: " + format + " expected "                                         + FieldsWriter.FORMAT_CURRENT + " or lower");      if (format > FieldsWriter.FORMAT)        formatSize = 4;      else        formatSize = 0;      if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)        cloneableFieldsStream.setModifiedUTF8StringsMode();      fieldsStream = (IndexInput) cloneableFieldsStream.clone();      final long indexSize = indexStream.length()-formatSize;      if (docStoreOffset != -1) {        // We read only a slice out of this shared fields file        this.docStoreOffset = docStoreOffset;        this.size = size;        // Verify the file is long enough to hold all of our        // docs        assert ((int) (indexSize / 8)) >= size + this.docStoreOffset: "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset;      } else {        this.docStoreOffset = 0;        this.size = (int) (indexSize >> 3);      }      numTotalDocs = (int) (indexSize >> 3);      success = true;    } finally {      // With lock-less commits, it's entirely possible (and      // fine) to hit a FileNotFound exception above. In      // this case, we want to explicitly close any subset      // of things that were opened so that we don't have to      // wait for a GC to do so.      
if (!success) {        close();      }    }  }  /**   * @throws AlreadyClosedException if this FieldsReader is closed   */  protected final void ensureOpen() throws AlreadyClosedException {    if (closed) {      throw new AlreadyClosedException("this FieldsReader is closed");    }  }  /**   * Closes the underlying {@link org.apache.lucene.store.IndexInput} streams, including any ones associated with a   * lazy implementation of a Field.  This means that the Fields values will not be accessible.   *   * @throws IOException   */  final void close() throws IOException {    if (!closed) {      if (fieldsStream != null) {        fieldsStream.close();      }      if (cloneableFieldsStream != null) {        cloneableFieldsStream.close();      }      if (indexStream != null) {        indexStream.close();      }      fieldsStreamTL.close();      closed = true;    }  }  final int size() {    return size;  }  private final void seekIndex(int docID) throws IOException {    indexStream.seek(formatSize + (docID + docStoreOffset) * 8L);  }  boolean canReadRawDocs() {    return format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;  }  final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {    seekIndex(n);    long position = indexStream.readLong();    fieldsStream.seek(position);    Document doc = new Document();    int numFields = fieldsStream.readVInt();    for (int i = 0; i < numFields; i++) {      int fieldNumber = fieldsStream.readVInt();      FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);      FieldSelectorResult acceptField = fieldSelector == null ? 
FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);            byte bits = fieldsStream.readByte();      assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;      boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;      boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;      //TODO: Find an alternative approach here if this list continues to grow beyond the      //list of 5 or 6 currently here.  See Lucene 762 for discussion      if (acceptField.equals(FieldSelectorResult.LOAD)) {        addField(doc, fi, binary, compressed, tokenize);      }      else if (acceptField.equals(FieldSelectorResult.LOAD_FOR_MERGE)) {        addFieldForMerge(doc, fi, binary, compressed, tokenize);      }      else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){        addField(doc, fi, binary, compressed, tokenize);        break;//Get out of this loop      }      else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {        addFieldLazy(doc, fi, binary, compressed, tokenize);      }      else if (acceptField.equals(FieldSelectorResult.SIZE)){        skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed));      }      else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){        addFieldSize(doc, fi, binary, compressed);        break;      }      else {        skipField(binary, compressed);      }    }    return doc;  }  /** Returns the length in bytes of each raw document in a   *  contiguous range of length numDocs starting with   *  startDocID.  
Returns the IndexInput (the fieldStream),   *  already seeked to the starting point for startDocID.*/  final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException {    seekIndex(startDocID);    long startOffset = indexStream.readLong();    long lastOffset = startOffset;    int count = 0;    while (count < numDocs) {      final long offset;      final int docID = docStoreOffset + startDocID + count + 1;      assert docID <= numTotalDocs;      if (docID < numTotalDocs)         offset = indexStream.readLong();      else        offset = fieldsStream.length();      lengths[count++] = (int) (offset-lastOffset);      lastOffset = offset;    }    fieldsStream.seek(startOffset);    return fieldsStream;  }  /**   * Skip the field.  We still have to read some of the information about the field, but can skip past the actual content.   * This will have the most payoff on large fields.   */  private void skipField(boolean binary, boolean compressed) throws IOException {    skipField(binary, compressed, fieldsStream.readVInt());  }    private void skipField(boolean binary, boolean compressed, int toRead) throws IOException {   if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) {     fieldsStream.seek(fieldsStream.getFilePointer() + toRead);   } else {     // We need to skip chars.  
This will slow us down, but still better     fieldsStream.skipChars(toRead);   }  }  private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {    if (binary) {      int toRead = fieldsStream.readVInt();      long pointer = fieldsStream.getFilePointer();      if (compressed) {        //was: doc.add(new Fieldable(fi.name, uncompress(b), Fieldable.Store.COMPRESS));        doc.add(new LazyField(fi.name, Field.Store.COMPRESS, toRead, pointer, binary));      } else {        //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));        doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary));      }      //Need to move the pointer ahead by toRead positions      fieldsStream.seek(pointer + toRead);    } else {      Field.Store store = Field.Store.YES;      Field.Index index = getIndexType(fi, tokenize);      Field.TermVector termVector = getTermVectorType(fi);      Fieldable f;      if (compressed) {        store = Field.Store.COMPRESS;        int toRead = fieldsStream.readVInt();        long pointer = fieldsStream.getFilePointer();        f = new LazyField(fi.name, store, toRead, pointer, binary);        //skip over the part that we aren't loading        fieldsStream.seek(pointer + toRead);        f.setOmitNorms(fi.omitNorms);      } else {        int length = fieldsStream.readVInt();        long pointer = fieldsStream.getFilePointer();        //Skip ahead of where we are by the length of what is stored        if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)          fieldsStream.seek(pointer+length);        else          fieldsStream.skipChars(length);        f = new LazyField(fi.name, store, index, termVector, length, pointer, binary);        f.setOmitNorms(fi.omitNorms);      }      doc.add(f);    }  }  // in merge mode we don't uncompress the data of a compressed field  private void addFieldForMerge(Document doc, FieldInfo fi, boolean binary, boolean 
compressed, boolean tokenize) throws IOException {    Object data;          if (binary || compressed) {      int toRead = fieldsStream.readVInt();      final byte[] b = new byte[toRead];      fieldsStream.readBytes(b, 0, b.length);
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -