TermVectorsReader.java
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;

import java.io.IOException;

/**
 * @version $Id: TermVectorsReader.java 601337 2007-12-05 13:59:37Z mikemccand $
 */
class TermVectorsReader implements Cloneable {

  static final int FORMAT_VERSION = 2;

  // The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
  static final int FORMAT_SIZE = 4;

  static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
  static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;

  private FieldInfos fieldInfos;

  private IndexInput tvx;
  private IndexInput tvd;
  private IndexInput tvf;
  private int size;

  // The docID offset where our docs begin in the index
  // file.  This will be 0 if we have our own private file.
  private int docStoreOffset;

  private int tvdFormat;
  private int tvfFormat;

  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
    throws CorruptIndexException, IOException {
    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
  }

  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
    throws CorruptIndexException, IOException {
    this(d, segment, fieldInfos, readBufferSize, -1, 0);
  }

  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize,
                    int docStoreOffset, int size)
    throws CorruptIndexException, IOException {
    boolean success = false;

    try {
      if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
        tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
        checkValidFormat(tvx);
        tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
        tvdFormat = checkValidFormat(tvd);
        tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
        tvfFormat = checkValidFormat(tvf);
        if (-1 == docStoreOffset) {
          this.docStoreOffset = 0;
          this.size = (int) (tvx.length() >> 3);
        } else {
          this.docStoreOffset = docStoreOffset;
          this.size = size;
          // Verify the file is long enough to hold all of our docs
          assert ((int) (tvx.length() / 8)) >= size + docStoreOffset;
        }
      }

      this.fieldInfos = fieldInfos;
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above.  In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException {
    int format = in.readInt();
    if (format > FORMAT_VERSION) {
      throw new CorruptIndexException("Incompatible format version: " + format +
                                      " expected " + FORMAT_VERSION + " or less");
    }
    return format;
  }

  void close() throws IOException {
    // Make all effort to close up. Keep the first exception
    // and throw it as a new one.
    IOException keep = null;
    if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
    if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
    if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
    if (keep != null) throw (IOException) keep.fillInStackTrace();
  }

  /**
   * @return The number of documents in the reader
   */
  int size() {
    return size;
  }

  public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
    if (tvx != null) {
      int fieldNumber = fieldInfos.fieldNumber(field);
      // We need to account for the FORMAT_SIZE when seeking in the tvx.
      // We don't need to do this in other seeks because we already have
      // the file pointer that was written in another file.
      tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
      //System.out.println("TVX Pointer: " + tvx.getFilePointer());
      long position = tvx.readLong();

      tvd.seek(position);
      int fieldCount = tvd.readVInt();
      //System.out.println("Num Fields: " + fieldCount);

      // There are only a few fields per document. We opt for a full scan
      // rather than requiring that they be ordered. We need to read through
      // all of the fields anyway to get to the tvf pointers.
      int number = 0;
      int found = -1;
      for (int i = 0; i < fieldCount; i++) {
        if (tvdFormat == FORMAT_VERSION)
          number = tvd.readVInt();
        else
          number += tvd.readVInt();

        if (number == fieldNumber)
          found = i;
      }

      if (found != -1) {
        // Compute position in the tvf file
        position = 0;
        for (int i = 0; i <= found; i++)
          position += tvd.readVLong();

        mapper.setDocumentNumber(docNum);
        readTermVector(field, position, mapper);
      } else {
        // This field, although valid in the segment, was not found in
        // this document
        //System.out.println("Fieldable not found");
      }
    } else {
      //System.out.println("No tvx file");
    }
  }

  /**
   * Retrieve the term vector for the given document and field
   * @param docNum The document number to retrieve the vector for
   * @param field The field within the document to retrieve
   * @return The TermFreqVector for the document and field, or null if there
   *         is no term vector for this field
   * @throws IOException if there is an error reading the term vector files
   */
  TermFreqVector get(int docNum, String field) throws IOException {
    // Check if no term vectors are available for this segment at all
    ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
    get(docNum, field, mapper);

    return mapper.materializeVector();
  }

  /**
   * Return all term vectors stored for this document, or null if they could
   * not be read in.
   *
   * @param docNum The document number to retrieve the vectors for
   * @return All term frequency vectors
   * @throws IOException if there is an error reading the term vector files
   */
  TermFreqVector[] get(int docNum) throws IOException {
    TermFreqVector[] result = null;
    if (tvx != null) {
      // We need to offset by FORMAT_SIZE when seeking in the tvx
      tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
      long position = tvx.readLong();

      tvd.seek(position);
      int fieldCount = tvd.readVInt();

      // No fields are vectorized for this document
      if (fieldCount != 0) {
        int number = 0;
        String[] fields = new String[fieldCount];

        for (int i = 0; i < fieldCount; i++) {
          if (tvdFormat == FORMAT_VERSION)
            number = tvd.readVInt();
          else
            number += tvd.readVInt();

          fields[i] = fieldInfos.fieldName(number);
        }

        // Compute position in the tvf file
        position = 0;
        long[] tvfPointers = new long[fieldCount];

        for (int i = 0; i < fieldCount; i++) {
          position += tvd.readVLong();
          tvfPointers[i] = position;
        }

        result = readTermVectors(docNum, fields, tvfPointers);
      }
    } else {
      //System.out.println("No tvx file");
    }
    return result;
  }

  public void get(int docNumber, TermVectorMapper mapper) throws IOException {
    // Check if no term vectors are available for this segment at all
    if (tvx != null) {
      // We need to offset by FORMAT_SIZE when seeking in the tvx
      tvx.seek(((docNumber + docStoreOffset) * 8L) + FORMAT_SIZE);
      long position = tvx.readLong();

      tvd.seek(position);
      int fieldCount = tvd.readVInt();

      // No fields are vectorized for this document
      if (fieldCount != 0) {
        int number = 0;
        String[] fields = new String[fieldCount];

        for (int i = 0; i < fieldCount; i++) {
          if (tvdFormat == FORMAT_VERSION)
            number = tvd.readVInt();
          else
            number += tvd.readVInt();

          fields[i] = fieldInfos.fieldName(number);
        }

        // Compute position in the tvf file
        position = 0;
        long[] tvfPointers = new long[fieldCount];

        for (int i = 0; i < fieldCount; i++) {
          position += tvd.readVLong();
          tvfPointers[i] = position;
        }

        // Feed every vectorized field of this document through the mapper
        mapper.setDocumentNumber(docNumber);
        for (int i = 0; i < fieldCount; i++)
          readTermVector(fields[i], tvfPointers[i], mapper);
      }
    } else {
      //System.out.println("No tvx file");
    }
  }

  // ... remainder of the class (the private readTermVector/readTermVectors
  // helpers used above, among others) is truncated in the original listing ...
}
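
A minimal usage sketch, not part of the original file: it assumes a Directory that already contains a segment "_0" written with term vectors, a matching FieldInfos, and a field named "contents" (all three names are hypothetical). Because TermVectorsReader is package-private, the sketch has to live inside org.apache.lucene.index:

package org.apache.lucene.index;

import java.io.IOException;

import org.apache.lucene.store.Directory;

// Hypothetical driver; "dir" and "fieldInfos" are assumed to be supplied
// by whatever opened the segment (in real use, a SegmentReader does this).
class TermVectorsReaderDemo {
  static void dumpVector(Directory dir, FieldInfos fieldInfos) throws IOException {
    TermVectorsReader reader = new TermVectorsReader(dir, "_0", fieldInfos);
    try {
      // Vector for the (assumed) field "contents" of document 0;
      // null when that field was not indexed with term vectors.
      TermFreqVector vector = reader.get(0, "contents");
      if (vector != null) {
        String[] terms = vector.getTerms();
        int[] freqs = vector.getTermFrequencies();
        for (int i = 0; i < terms.length; i++)
          System.out.println(terms[i] + ": " + freqs[i]);
      }
    } finally {
      reader.close();  // releases tvx, tvd and tvf
    }
  }
}

Note that the get variants signal a missing vector by returning null rather than throwing, so callers always need the null check before touching the parallel terms/frequencies arrays.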