SegmentMerger.java
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Vector;
import java.util.Iterator;
import java.util.Collection;
import java.io.IOException;

import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.IndexInput;

/**
 * The SegmentMerger class combines two or more Segments, represented by an
 * IndexReader ({@link #add}), into a single Segment.  After adding the
 * appropriate readers, call the merge method to combine the segments.
 * <P>
 * If the compoundFile flag is set, then the segments will be merged into a
 * compound file.
 *
 * @see #merge
 * @see #add
 */
final class SegmentMerger {

  /** norms header placeholder */
  static final byte[] NORMS_HEADER = new byte[]{'N','R','M',-1};

  private Directory directory;
  private String segment;
  private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;

  private Vector readers = new Vector();
  private FieldInfos fieldInfos;

  private int mergedDocs;

  private CheckAbort checkAbort;

  // Whether we should merge doc stores (stored fields and
  // vectors files).  When all segments we are merging
  // already share the same doc store files, we don't need
  // to merge the doc stores.
  private boolean mergeDocStores;

  /** Maximum number of contiguous documents to bulk-copy
      when merging stored fields */
  private final static int MAX_RAW_MERGE_DOCS = 4192;

  /** This ctor is used only by test code.
   *
   * @param dir The Directory to merge the other segments into
   * @param name The name of the new segment
   */
  SegmentMerger(Directory dir, String name) {
    directory = dir;
    segment = name;
  }

  SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge) {
    directory = writer.getDirectory();
    segment = name;
    if (merge != null)
      checkAbort = new CheckAbort(merge, directory);
    termIndexInterval = writer.getTermIndexInterval();
  }

  /**
   * Add an IndexReader to the collection of readers that are to be merged
   * @param reader
   */
  final void add(IndexReader reader) {
    readers.addElement(reader);
  }

  /**
   * @param i The index of the reader to return
   * @return The ith reader to be merged
   */
  final IndexReader segmentReader(int i) {
    return (IndexReader) readers.elementAt(i);
  }

  /**
   * Merges the readers specified by the {@link #add} method into the
   * directory passed to the constructor.
   * @return The number of documents that were merged
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
  final int merge() throws CorruptIndexException, IOException {
    return merge(true);
  }

  /**
   * Merges the readers specified by the {@link #add} method
   * into the directory passed to the constructor.
   * @param mergeDocStores if false, we will not merge the
   *  stored fields nor vectors files
   * @return The number of documents that were merged
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
  final int merge(boolean mergeDocStores) throws CorruptIndexException, IOException {

    this.mergeDocStores = mergeDocStores;

    // NOTE: it's important to add calls to
    // checkAbort.work(...) if you make any changes to this
    // method that will spend a lot of time.  The frequency
    // of this check impacts how long
    // IndexWriter.close(false) takes to actually stop the
    // threads.

    mergedDocs = mergeFields();
    mergeTerms();
    mergeNorms();

    if (mergeDocStores && fieldInfos.hasVectors())
      mergeVectors();

    return mergedDocs;
  }

  /**
   * close all IndexReaders that have been added.
   * Should not be called before merge().
   * @throws IOException
   */
  final void closeReaders() throws IOException {
    for (int i = 0; i < readers.size(); i++) {  // close readers
      IndexReader reader = (IndexReader) readers.elementAt(i);
      reader.close();
    }
  }

  final Vector createCompoundFile(String fileName)
          throws IOException {
    CompoundFileWriter cfsWriter =
      new CompoundFileWriter(directory, fileName, checkAbort);

    Vector files =
      new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + 1);

    // Basic files
    for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
      String ext = IndexFileNames.COMPOUND_EXTENSIONS[i];
      if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) &&
                             !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION)))
        files.add(segment + "." + ext);
    }

    // Fieldable norm files
    for (int i = 0; i < fieldInfos.size(); i++) {
      FieldInfo fi = fieldInfos.fieldInfo(i);
      if (fi.isIndexed && !fi.omitNorms) {
        files.add(segment + "." + IndexFileNames.NORMS_EXTENSION);
        break;
      }
    }

    // Vector files
    if (fieldInfos.hasVectors() && mergeDocStores) {
      for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.length; i++) {
        files.add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
      }
    }

    // Now merge all added files
    Iterator it = files.iterator();
    while (it.hasNext()) {
      cfsWriter.addFile((String) it.next());
    }

    // Perform the merge
    cfsWriter.close();

    return files;
  }

  private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names,
                          boolean storeTermVectors, boolean storePositionWithTermVector,
                          boolean storeOffsetWithTermVector, boolean storePayloads) throws IOException {
    Iterator i = names.iterator();
    while (i.hasNext()) {
      String field = (String) i.next();
      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector,
                     storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads);
    }
  }

  /**
   * @return The number of documents in all of the readers
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */
  private final int mergeFields() throws CorruptIndexException, IOException {

    if (!mergeDocStores) {
      // When we are not merging by doc stores, that means
      // all segments were written as part of a single
      // autoCommit=false IndexWriter session, so their field
      // name -> number mapping are the same.  So, we start
      // with the fieldInfos of the last segment in this
      // case, to keep that numbering.
      final SegmentReader sr = (SegmentReader) readers.elementAt(readers.size()-1);
      fieldInfos = (FieldInfos) sr.fieldInfos.clone();
    } else {
      fieldInfos = new FieldInfos();              // merge field names
    }

    for (int i = 0; i < readers.size(); i++) {
      IndexReader reader = (IndexReader) readers.elementAt(i);
      if (reader instanceof SegmentReader) {
        SegmentReader segmentReader = (SegmentReader) reader;
        for (int j = 0; j < segmentReader.getFieldInfos().size(); j++) {
          FieldInfo fi = segmentReader.getFieldInfos().fieldInfo(j);
          fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector,
                         fi.storePositionWithTermVector, fi.storeOffsetWithTermVector,
                         !reader.hasNorms(fi.name), fi.storePayloads);
        }
      } else {
        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
        fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
      }
    }
    fieldInfos.write(directory, segment + ".fnm");

    int docCount = 0;

    if (mergeDocStores) {

      // If the i'th reader is a SegmentReader and has
      // identical fieldName -> number mapping, then this
      // array will be non-null at position i:
      SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.size()];

      // If this reader is a SegmentReader, and all of its
      // field name -> number mappings match the "merged"
      // FieldInfos, then we can do a bulk copy of the
      // stored fields:
      for (int i = 0; i < readers.size(); i++) {
        IndexReader reader = (IndexReader) readers.elementAt(i);
        if (reader instanceof SegmentReader) {
          SegmentReader segmentReader = (SegmentReader) reader;
          boolean same = true;
          FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
          for (int j = 0; same && j < segmentFieldInfos.size(); j++)
            same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
          if (same) {
            matchingSegmentReaders[i] = segmentReader;
          }
        }
      }

      // Used for bulk-reading raw bytes for stored fields
      final int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];

      // for merging we don't want to compress/uncompress the data, so to tell
      // the FieldsReader that we're in merge mode, we use this FieldSelector
      FieldSelector fieldSelectorMerge = new FieldSelector() {
        public FieldSelectorResult accept(String fieldName) {
          return FieldSelectorResult.LOAD_FOR_MERGE;
        }
      };

      // merge field values
      final FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);

      try {
        for (int i = 0; i < readers.size(); i++) {
          final IndexReader reader = (IndexReader) readers.elementAt(i);
          final SegmentReader matchingSegmentReader = matchingSegmentReaders[i];
          final FieldsReader matchingFieldsReader;
          if (matchingSegmentReader != null)
            matchingFieldsReader = matchingSegmentReader.getFieldsReader();
          else
            matchingFieldsReader = null;
          final int maxDoc = reader.maxDoc();
          for (int j = 0; j < maxDoc;) {
            if (!reader.isDeleted(j)) { // skip deleted docs
              if (matchingSegmentReader != null) {
                // We can optimize this case (doing a bulk
                // byte copy) since the field numbers are
                // identical
                int start = j;
                int numDocs = 0;
                do {
                  j++;
                  numDocs++;
                } while (j < maxDoc && !matchingSegmentReader.isDeleted(j) && numDocs < MAX_RAW_MERGE_DOCS);

                IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, start, numDocs);
                fieldsWriter.addRawDocuments(stream, rawDocLengths, numDocs);
                docCount += numDocs;
                if (checkAbort != null)
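The listing above is truncated inside mergeFields(), but the class Javadoc already describes the intended call sequence: add() an IndexReader for each input segment, call merge(), optionally pack the result with createCompoundFile(), then closeReaders(). The sketch below only illustrates that sequence and is not part of SegmentMerger.java; SegmentMergerUsageSketch, the segment name, and the readers array are hypothetical placeholders, and the class sits in org.apache.lucene.index solely because SegmentMerger is package-private.

package org.apache.lucene.index;

import java.io.IOException;

import org.apache.lucene.store.Directory;

// Hypothetical helper, not part of Lucene: shows the add -> merge ->
// createCompoundFile -> closeReaders sequence described in the Javadoc above.
class SegmentMergerUsageSketch {

  static int mergeInto(Directory dir, String newSegmentName, IndexReader[] readers)
      throws CorruptIndexException, IOException {
    // Test-only constructor from the listing: merge into "dir" under "newSegmentName".
    SegmentMerger merger = new SegmentMerger(dir, newSegmentName);

    for (int i = 0; i < readers.length; i++)
      merger.add(readers[i]);                 // register each segment to be merged

    int mergedDocCount = merger.merge();      // merge(true): fields, terms, norms, vectors

    // Optional: pack the new segment's files into a single compound (.cfs) file.
    merger.createCompoundFile(newSegmentName + ".cfs");

    merger.closeReaders();                    // close every reader that was added
    return mergedDocCount;
  }
}

In Lucene itself this sequence is driven by IndexWriter, which supplies the second constructor's CheckAbort hook and decides, via the merge(boolean) variant, whether the doc stores (stored fields and vectors) need to be merged at all.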