📄 indexwriter.java
字号:
*/
  public synchronized void deleteDocuments(Term term) throws IOException {
    bufferDeleteTerm(term);     // buffer the delete; it is applied at flush time
    maybeFlushRamSegments();    // flush if buffered docs/deletes reached the threshold
  }

  /**
   * Deletes the document(s) containing any of the terms.
   * All deletes are flushed at the same time.
   *
   * @param terms array of terms to identify the documents to be deleted
   */
  public synchronized void deleteDocuments(Term[] terms) throws IOException {
    for (int i = 0; i < terms.length; i++) {
      bufferDeleteTerm(terms[i]);
    }
    maybeFlushRamSegments();
  }

  /**
   * Updates a document by first deleting the document(s)
   * containing <code>term</code> and then adding the new
   * document.  The delete and then add are atomic as seen
   * by a reader on the same index (flush may happen only after
   * the add).  Uses this writer's default analyzer
   * ({@link #getAnalyzer()}).
   *
   * @param term the term to identify the document(s) to be deleted
   * @param doc the document to be added
   */
  public void updateDocument(Term term, Document doc) throws IOException {
    updateDocument(term, doc, getAnalyzer());
  }

  /**
   * Updates a document by first deleting the document(s)
   * containing <code>term</code> and then adding the new
   * document.  The delete and then add are atomic as seen
   * by a reader on the same index (flush may happen only after
   * the add).
* @param term the term to identify the document(s) to be deleted
   * @param doc the document to be added
   * @param analyzer the analyzer to use when analyzing the document
   */
  public void updateDocument(Term term, Document doc, Analyzer analyzer)
      throws IOException {
    // Build the single-document segment outside the writer lock, then
    // buffer the delete and register the new segment atomically.
    SegmentInfo newSegmentInfo = buildSingleDocSegment(doc, analyzer);
    synchronized (this) {
      bufferDeleteTerm(term);
      ramSegmentInfos.addElement(newSegmentInfo);
      maybeFlushRamSegments();
    }
  }

  // Generates a unique name for a new RAM-resident segment, e.g. "_ram_a"
  // (counter rendered in base Character.MAX_RADIX).
  final synchronized String newRamSegmentName() {
    return "_ram_" + Integer.toString(ramSegmentInfos.counter++, Character.MAX_RADIX);
  }

  // for test purposes: number of on-disk segments
  final synchronized int getSegmentCount(){
    return segmentInfos.size();
  }

  // for test purposes: number of buffered RAM segments
  final synchronized int getRamSegmentCount(){
    return ramSegmentInfos.size();
  }

  // for test purposes: doc count of segment i, or -1 if i is out of range
  final synchronized int getDocCount(int i) {
    if (i >= 0 && i < segmentInfos.size()) {
      return segmentInfos.info(i).docCount;
    } else {
      return -1;
    }
  }

  // Generates a unique name for a new on-disk segment, e.g. "_a".
  final synchronized String newSegmentName() {
    return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
  }

  /** Determines how often segment indices are merged by addDocument().  With
   * smaller values, less RAM is used while indexing, and searches on
   * unoptimized indices are faster, but indexing speed is slower.  With larger
   * values, more RAM is used during indexing, and while searches on unoptimized
   * indices are slower, indexing is faster.  Thus larger values (> 10) are best
   * for batch index creation, and smaller values (< 10) for indices that are
   * interactively maintained.
   *
   * <p>This must never be less than 2.  The default value is
   * {@link #DEFAULT_MERGE_FACTOR}.
   */
  private int mergeFactor = DEFAULT_MERGE_FACTOR;

  /** Determines the minimal number of documents required before the buffered
   * in-memory documents are merging and a new Segment is created.
   * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
   * large value gives faster indexing.
At the same time, mergeFactor limits
   * the number of files open in a FSDirectory.
   *
   * <p>The default value is {@link #DEFAULT_MAX_BUFFERED_DOCS}.
   */
  private int minMergeDocs = DEFAULT_MAX_BUFFERED_DOCS;

  /** Determines the largest number of documents ever merged by addDocument().
   * Small values (e.g., less than 10,000) are best for interactive indexing,
   * as this limits the length of pauses while indexing to a few seconds.
   * Larger values are best for batched indexing and speedier searches.
   *
   * <p>The default value is {@link #DEFAULT_MAX_MERGE_DOCS}.
   */
  private int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

  /** If non-null, information about merges will be printed to this. */
  private PrintStream infoStream = null;

  /** Merges all segments together into a single segment,
   * optimizing an index for search.
   *
   * <p>Note that this requires substantial temporary free
   * space in the Directory (see <a target="_top"
   * href="http://issues.apache.org/jira/browse/LUCENE-764">LUCENE-764</a>
   * for details):</p>
   *
   * <ul>
   * <li>
   *
   * <p>If no readers/searchers are open against the index,
   * then free space required is up to 1X the total size of
   * the starting index.  For example, if the starting
   * index is 10 GB, then you must have up to 10 GB of free
   * space before calling optimize.</p>
   *
   * <li>
   *
   * <p>If readers/searchers are using the index, then free
   * space required is up to 2X the size of the starting
   * index.  This is because in addition to the 1X used by
   * optimize, the original 1X of the starting index is
   * still consuming space in the Directory as the readers
   * are holding the segments files open.
Even on Unix,
   * where it will appear as if the files are gone ("ls"
   * won't list them), they still consume storage due to
   * "delete on last close" semantics.</p>
   *
   * <p>Furthermore, if some but not all readers re-open
   * while the optimize is underway, this will cause > 2X
   * temporary space to be consumed as those new readers
   * will then hold open the partially optimized segments at
   * that time.  It is best not to re-open readers while
   * optimize is running.</p>
   *
   * </ul>
   *
   * <p>The actual temporary usage could be much less than
   * these figures (it depends on many factors).</p>
   *
   * <p>Once the optimize completes, the total size of the
   * index will be less than the size of the starting index.
   * It could be quite a bit smaller (if there were many
   * pending deletes) or just slightly smaller.</p>
   *
   * <p>If an Exception is hit during optimize(), for example
   * due to disk full, the index will not be corrupt and no
   * documents will have been lost.  However, it may have
   * been partially optimized (some segments were merged but
   * not all), and it's possible that one of the segments in
   * the index will be in non-compound format even when
   * using compound file format.  This will occur when the
   * Exception is hit during conversion of the segment into
   * compound format.</p>
   */
  public synchronized void optimize() throws IOException {
    flushRamSegments();
    // Keep merging until exactly one segment remains that: has no
    // deletions, has no separate norms files, lives in this writer's
    // directory, and (when compound files are enabled) uses the
    // compound format.
    while (segmentInfos.size() > 1 ||
           (segmentInfos.size() == 1 &&
            (SegmentReader.hasDeletions(segmentInfos.info(0)) ||
             SegmentReader.hasSeparateNorms(segmentInfos.info(0)) ||
             segmentInfos.info(0).dir != directory ||
             (useCompoundFile &&
              (!SegmentReader.usesCompoundFile(segmentInfos.info(0))))))) {
      // Merge at most the trailing mergeFactor segments per pass.
      int minSegment = segmentInfos.size() - mergeFactor;
      mergeSegments(segmentInfos, minSegment < 0 ? 0 : minSegment, segmentInfos.size());
    }
  }

  /*
   * Begin a transaction.
During a transaction, any segment
   * merges that happen (or ram segments flushed) will not
   * write a new segments file and will not remove any files
   * that were present at the start of the transaction.  You
   * must make a matched (try/finally) call to
   * commitTransaction() or rollbackTransaction() to finish
   * the transaction.
   */
  private void startTransaction() throws IOException {
    if (inTransaction) {
      throw new IOException("transaction is already in process");
    }
    // Snapshot the current segments so rollback can restore them.
    rollbackSegmentInfos = (SegmentInfos) segmentInfos.clone();
    // Record the names of all segments alive at transaction start; their
    // files must be protected from deletion while the transaction runs.
    protectedSegments = new HashSet();
    for(int i=0;i<segmentInfos.size();i++) {
      SegmentInfo si = (SegmentInfo) segmentInfos.elementAt(i);
      protectedSegments.add(si.name);
    }
    inTransaction = true;
  }

  /*
   * Rolls back the transaction and restores state to where
   * we were at the start.
   */
  private void rollbackTransaction() throws IOException {

    // Keep the same segmentInfos instance but replace all
    // of its SegmentInfo instances.  This is so the next
    // attempt to commit using this instance of IndexWriter
    // will always write to a new generation ("write once").
    segmentInfos.clear();
    segmentInfos.addAll(rollbackSegmentInfos);

    // Ask deleter to locate unreferenced files & remove them:
    deleter.clearPendingFiles();
    deleter.findDeletableFiles();
    deleter.deleteFiles();

    clearTransaction();
  }

  /*
   * Commits the transaction.
This will write the new
   * segments file and remove any pending deletions we have
   * accumulated during the transaction.
   */
  private void commitTransaction() throws IOException {
    if (commitPending) {
      boolean success = false;
      try {
        // If we hit e.g. disk full during this write we have to rollback:
        segmentInfos.write(directory);    // commit changes
        success = true;
      } finally {
        if (!success) {
          rollbackTransaction();
        }
      }
      deleter.commitPendingFiles();
      commitPending = false;
    }
    clearTransaction();
  }

  /* Should only be called by rollbackTransaction &
   * commitTransaction */
  private void clearTransaction() {
    protectedSegments = null;
    rollbackSegmentInfos = null;
    inTransaction = false;
  }

  /** Merges all segments from an array of indexes into this index.
   *
   * <p>This may be used to parallelize batch indexing.  A large document
   * collection can be broken into sub-collections.  Each sub-collection can be
   * indexed in parallel, on a different thread, process or machine.  The
   * complete index can then be created by merging sub-collection indexes
   * with this method.
   *
   * <p>After this completes, the index is optimized.
   *
   * <p>This method is transactional in how Exceptions are
   * handled: it does not commit a new segments_N file until
   * all indexes are added.  This means if an Exception
   * occurs (for example disk full), then either no indexes
   * will have been added or they all will have been.</p>
   *
   * <p>If an Exception is hit, it's still possible that all
   * indexes were successfully added.  This happens when the
   * Exception is hit when trying to build a CFS file.  In
   * this case, one segment in the index will be in non-CFS
   * format, even when using compound file format.</p>
   *
   * <p>Also note that on an Exception, the index may still
   * have been partially or fully optimized even though none
   * of the input indexes were added.</p>
   *
   * <p>Note that this requires temporary free space in the
   * Directory up to 2X the sum of all input indexes
   * (including the starting index).
If readers/searchers
   * are open against the starting index, then temporary
   * free space required will be higher by the size of the
   * starting index (see {@link #optimize()} for details).
   * </p>
   *
   * <p>Once this completes, the final size of the index
   * will be less than the sum of all input index sizes
   * (including the starting index).  It could be quite a
   * bit smaller (if there were many pending deletes) or
   * just slightly smaller.</p>
   *
   * <p>See <a target="_top"
   * href="http://issues.apache.org/jira/browse/LUCENE-702">LUCENE-702</a>
   * for details.</p>
   */
  public synchronized void addIndexes(Directory[] dirs)
      throws IOException {

    optimize();    // start with zero or 1 seg

    int start = segmentInfos.size();
    boolean success = false;

    // All merges below run inside a transaction so a failure leaves the
    // original segments file (and its referenced files) intact.
    startTransaction();

    try {
      // Append every segment from every input directory to our infos.
      for (int i = 0; i < dirs.length; i++) {
        SegmentInfos sis = new SegmentInfos();    // read infos from dir
        sis.read(dirs[i]);
        for (int j = 0; j < sis.size(); j++) {
          segmentInfos.addElement(sis.info(j));   // add each info
        }
      }

      // merge newly added segments in log(n) passes
      while (segmentInfos.size() > start+mergeFactor) {
        for (int base = start; base < segmentInfos.size(); base++) {
          int end = Math.min(segmentInfos.size(), base+mergeFactor);
          if (end-base > 1) {
            mergeSegments(segmentInfos, base, end);
          }
        }
      }
      success = true;
    } finally {
      if (success) {
        commitTransaction();
      } else {
        rollbackTransaction();
      }
    }

    optimize();    // final cleanup
  }

  /**
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -