📄 termvectorswriter.java

📁 lucene-2.4.0 是一个全文搜索的工具包
💻 JAVA
字号:
package org.apache.lucene.index;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import org.apache.lucene.store.Directory;import org.apache.lucene.store.IndexOutput;import org.apache.lucene.util.StringHelper;import org.apache.lucene.util.UnicodeUtil;import java.io.IOException;final class TermVectorsWriter {    private IndexOutput tvx = null, tvd = null, tvf = null;  private FieldInfos fieldInfos;  final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(),                                                                             new UnicodeUtil.UTF8Result()};  public TermVectorsWriter(Directory directory, String segment,                           FieldInfos fieldInfos)    throws IOException {    // Open files for TermVector storage    tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);    tvx.writeInt(TermVectorsReader.FORMAT_CURRENT);    tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);    tvd.writeInt(TermVectorsReader.FORMAT_CURRENT);    tvf = directory.createOutput(segment + "." 
+ IndexFileNames.VECTORS_FIELDS_EXTENSION);    tvf.writeInt(TermVectorsReader.FORMAT_CURRENT);    this.fieldInfos = fieldInfos;  }  /**   * Add a complete document specified by all its term vectors. If document has no   * term vectors, add value for tvx.   *    * @param vectors   * @throws IOException   */  public final void addAllDocVectors(TermFreqVector[] vectors)      throws IOException {    tvx.writeLong(tvd.getFilePointer());    tvx.writeLong(tvf.getFilePointer());    if (vectors != null) {      final int numFields = vectors.length;      tvd.writeVInt(numFields);      long[] fieldPointers = new long[numFields];      for (int i=0; i<numFields; i++) {        fieldPointers[i] = tvf.getFilePointer();        final int fieldNumber = fieldInfos.fieldNumber(vectors[i].getField());        // 1st pass: write field numbers to tvd        tvd.writeVInt(fieldNumber);        final int numTerms = vectors[i].size();        tvf.writeVInt(numTerms);        final TermPositionVector tpVector;        final byte bits;        final boolean storePositions;        final boolean storeOffsets;        if (vectors[i] instanceof TermPositionVector) {          // May have positions & offsets          tpVector = (TermPositionVector) vectors[i];          storePositions = tpVector.size() > 0 && tpVector.getTermPositions(0) != null;          storeOffsets = tpVector.size() > 0 && tpVector.getOffsets(0) != null;          bits = (byte) ((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : 0) +                         (storeOffsets ? 
TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : 0));        } else {          tpVector = null;          bits = 0;          storePositions = false;          storeOffsets = false;        }        tvf.writeVInt(bits);        final String[] terms = vectors[i].getTerms();        final int[] freqs = vectors[i].getTermFrequencies();        int utf8Upto = 0;        utf8Results[1].length = 0;        for (int j=0; j<numTerms; j++) {          UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]);                    int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result,                                                   utf8Results[1-utf8Upto].length,                                                   utf8Results[utf8Upto].result,                                                   utf8Results[utf8Upto].length);          int length = utf8Results[utf8Upto].length - start;          tvf.writeVInt(start);       // write shared prefix length          tvf.writeVInt(length);        // write delta length          tvf.writeBytes(utf8Results[utf8Upto].result, start, length);  // write delta bytes          utf8Upto = 1-utf8Upto;          final int termFreq = freqs[j];          tvf.writeVInt(termFreq);          if (storePositions) {            final int[] positions = tpVector.getTermPositions(j);            if (positions == null)              throw new IllegalStateException("Trying to write positions that are null!");            assert positions.length == termFreq;            // use delta encoding for positions            int lastPosition = 0;            for(int k=0;k<positions.length;k++) {              final int position = positions[k];              tvf.writeVInt(position-lastPosition);              lastPosition = position;            }          }          if (storeOffsets) {            final TermVectorOffsetInfo[] offsets = tpVector.getOffsets(j);            if (offsets == null)              throw new IllegalStateException("Trying to write 
offsets that are null!");            assert offsets.length == termFreq;            // use delta encoding for offsets            int lastEndOffset = 0;            for(int k=0;k<offsets.length;k++) {              final int startOffset = offsets[k].getStartOffset();              final int endOffset = offsets[k].getEndOffset();              tvf.writeVInt(startOffset-lastEndOffset);              tvf.writeVInt(endOffset-startOffset);              lastEndOffset = endOffset;            }          }        }      }      // 2nd pass: write field pointers to tvd      long lastFieldPointer = fieldPointers[0];      for (int i=1; i<numFields; i++) {        final long fieldPointer = fieldPointers[i];        tvd.writeVLong(fieldPointer-lastFieldPointer);        lastFieldPointer = fieldPointer;      }    } else      tvd.writeVInt(0);  }  /**   * Do a bulk copy of numDocs documents from reader to our   * streams.  This is used to expedite merging, if the   * field numbers are congruent.   */  final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {    long tvdPosition = tvd.getFilePointer();    long tvfPosition = tvf.getFilePointer();    long tvdStart = tvdPosition;    long tvfStart = tvfPosition;    for(int i=0;i<numDocs;i++) {      tvx.writeLong(tvdPosition);      tvdPosition += tvdLengths[i];      tvx.writeLong(tvfPosition);      tvfPosition += tvfLengths[i];    }    tvd.copyBytes(reader.getTvdStream(), tvdPosition-tvdStart);    tvf.copyBytes(reader.getTvfStream(), tvfPosition-tvfStart);    assert tvd.getFilePointer() == tvdPosition;    assert tvf.getFilePointer() == tvfPosition;  }    /** Close all streams. 
*/  final void close() throws IOException {    // make an effort to close all streams we can but remember and re-throw    // the first exception encountered in this process    IOException keep = null;    if (tvx != null)      try {        tvx.close();      } catch (IOException e) {        if (keep == null) keep = e;      }    if (tvd != null)      try {        tvd.close();      } catch (IOException e) {        if (keep == null) keep = e;      }    if (tvf != null)      try {        tvf.close();      } catch (IOException e) {        if (keep == null) keep = e;      }    if (keep != null) throw (IOException) keep.fillInStackTrace();  }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -