📄 checkindex.java
字号:
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.document.Document;

import java.text.NumberFormat;
import java.io.PrintStream;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;

/**
 * Basic tool to check the health of an index and write a
 * new segments file that removes reference to problematic
 * segments.  There are many more checks that this tool
 * could do but does not yet, eg: reconstructing a segments
 * file by looking for all loadable segments (if no segments
 * file is found), removing specifically specified segments,
 * listing files that exist but are not referenced, etc.
 */
public class CheckIndex {

  /** Stream that all progress, warning and error output is written to. */
  public static PrintStream out = System.out;

  /**
   * SegmentTermDocs subclass that counts calls to {@link #skippingDoc()}
   * since the last {@link #seek(Term)}.  {@link CheckIndex#check} uses that
   * count as the number of deleted docs for a term.
   */
  private static class MySegmentTermDocs extends SegmentTermDocs {

    /** Number of skippingDoc() callbacks seen since the last seek. */
    int delCount;

    MySegmentTermDocs(SegmentReader p) {
      super(p);
    }

    public void seek(Term term) throws IOException {
      super.seek(term);
      // Start counting afresh for every term.
      delCount = 0;
    }

    protected void skippingDoc() throws IOException {
      delCount++;
    }
  }

  /**
   * Checks every segment referenced by the current segments file in
   * <code>dir</code>, printing a report to {@link #out}.  For each segment
   * it opens a reader and walks the norms, the terms with their
   * frequencies and positions, the stored fields and the term vectors.  A
   * segment for which any of this throws is reported as broken.
   *
   * <p>If broken segments were found and <code>doFix</code> is true, a new
   * segments file that references only the healthy segments is written
   * after a five second countdown.
   *
   * @param dir directory holding the index
   * @param doFix if true, write a new segments file that drops broken
   *        segments; if false, only report what would be done
   * @return true if index is clean, else false (also false when the
   *         segments file cannot be read, the index format is newer than
   *         this tool understands, or segments had to be dropped)
   * @throws IOException if closing the segments file or a segment reader
   *         fails
   */
  public static boolean check(Directory dir, boolean doFix) throws IOException {
    NumberFormat nf = NumberFormat.getInstance();
    SegmentInfos sis = new SegmentInfos();

    try {
      sis.read(dir);
    } catch (Throwable t) {
      out.println("ERROR: could not read any segments file in directory");
      t.printStackTrace(out);
      return false;
    }

    final int numSegments = sis.size();
    final String segmentsFileName = sis.getCurrentSegmentFileName();

    IndexInput input = null;
    try {
      input = dir.openInput(segmentsFileName);
    } catch (Throwable t) {
      out.println("ERROR: could not open segments file in directory");
      t.printStackTrace(out);
      return false;
    }

    // The first int of the segments file is its format marker.
    int format = 0;
    try {
      format = input.readInt();
    } catch (Throwable t) {
      out.println("ERROR: could not read segment file version in directory");
      t.printStackTrace(out);
      return false;
    } finally {
      if (input != null) {
        input.close();
      }
    }

    String sFormat = "";
    boolean skip = false;

    // One single if/else-if chain: previously the first test was a
    // stand-alone "if", so a FORMAT index fell through to the final else
    // and was mislabelled as "[Lucene 1.3 or prior]".
    if (format == SegmentInfos.FORMAT) {
      sFormat = "FORMAT [Lucene Pre-2.1]";
    } else if (format == SegmentInfos.FORMAT_LOCKLESS) {
      sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";
    } else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE) {
      sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";
    } else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE) {
      sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
    } else if (format < SegmentInfos.FORMAT_SHARED_DOC_STORE) {
      // Values below the newest known marker denote a newer format.
      sFormat = "int=" + format + " [newer version of Lucene than this tool]";
      skip = true;
    } else {
      sFormat = format + " [Lucene 1.3 or prior]";
    }

    out.println("Segments file=" + segmentsFileName + " numSegments=" + numSegments + " version=" + sFormat);

    if (skip) {
      out.println("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");
      return false;
    }

    // Collects the segments that pass all tests.
    SegmentInfos newSIS = (SegmentInfos) sis.clone();
    newSIS.clear();
    boolean changed = false;
    int totLoseDocCount = 0;
    int numBadSegments = 0;

    for (int i = 0; i < numSegments; i++) {
      final SegmentInfo info = sis.info(i);
      out.println(" " + (1+i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount);

      // Until the reader opens we must assume every doc would be lost.
      int toLoseDocCount = info.docCount;

      SegmentReader reader = null;

      try {
        out.println(" compound=" + info.getUseCompoundFile());
        out.println(" numFiles=" + info.files().size());
        out.println(" size (MB)=" + nf.format(info.sizeInBytes()/(1024.*1024.)));

        final int docStoreOffset = info.getDocStoreOffset();
        if (docStoreOffset != -1) {
          out.println(" docStoreOffset=" + docStoreOffset);
          out.println(" docStoreSegment=" + info.getDocStoreSegment());
          out.println(" docStoreIsCompoundFile=" + info.getDocStoreIsCompoundFile());
        }

        final String delFileName = info.getDelFileName();
        if (delFileName == null) {
          out.println(" no deletions");
        } else {
          out.println(" has deletions [delFileName=" + delFileName + "]");
        }

        out.print(" test: open reader.........");
        reader = SegmentReader.get(info);
        final int numDocs = reader.numDocs();
        // Only live docs can be lost from here on.
        toLoseDocCount = numDocs;
        if (reader.hasDeletions()) {
          out.println("OK [" + (info.docCount - numDocs) + " deleted docs]");
        } else {
          out.println("OK");
        }

        out.print(" test: fields, norms.......");
        Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
        Iterator it = fieldNames.iterator();
        while (it.hasNext()) {
          final String fieldName = (String) it.next();
          byte[] b = reader.norms(fieldName);
          if (b.length != info.docCount) {
            throw new RuntimeException("norms for field \"" + fieldName + "\" is length " + b.length + " != maxDoc " + info.docCount);
          }
        }
        out.println("OK [" + fieldNames.size() + " fields]");

        out.print(" test: terms, freq, prox...");
        final TermEnum termEnum = reader.terms();
        final TermPositions termPositions = reader.termPositions();

        // Used only to count up # deleted docs for this
        // term
        final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);

        long termCount = 0;
        long totFreq = 0;
        long totPos = 0;
        while (termEnum.next()) {
          termCount++;
          final Term term = termEnum.term();
          final int docFreq = termEnum.docFreq();
          termPositions.seek(term);
          int lastDoc = -1;
          int freq0 = 0;
          totFreq += docFreq;
          while (termPositions.next()) {
            freq0++;
            final int doc = termPositions.doc();
            final int freq = termPositions.freq();
            if (doc <= lastDoc) {
              throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
            }
            lastDoc = doc;
            if (freq <= 0) {
              throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
            }

            int lastPos = -1;
            totPos += freq;
            for (int j = 0; j < freq; j++) {
              final int pos = termPositions.nextPosition();
              if (pos < -1) {
                throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
              }
              if (pos < lastPos) {
                throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
              }
              // Remember the position so the ordering check above is
              // effective for the next one.
              lastPos = pos;
            }
          }

          // Now count how many deleted docs occurred in
          // this term:
          final int delCount;
          if (reader.hasDeletions()) {
            myTermDocs.seek(term);
            while (myTermDocs.next()) {
            }
            delCount = myTermDocs.delCount;
          } else {
            delCount = 0;
          }

          if (freq0 + delCount != docFreq) {
            throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
          }
        }

        out.println("OK [" + termCount + " terms; " + totFreq + " terms/docs pairs; " + totPos + " tokens]");

        out.print(" test: stored fields.......");
        int docCount = 0;
        long totFields = 0;
        for (int j = 0; j < info.docCount; j++) {
          if (!reader.isDeleted(j)) {
            docCount++;
            Document doc = reader.document(j);
            totFields += doc.getFields().size();
          }
        }

        if (docCount != reader.numDocs()) {
          // Report expected (numDocs) versus observed (docCount).
          throw new RuntimeException("docCount=" + reader.numDocs() + " but saw " + docCount + " undeleted docs");
        }

        out.println("OK [" + totFields + " total field count; avg " + nf.format((((float) totFields)/docCount)) + " fields per doc]");

        out.print(" test: term vectors........");
        int totVectors = 0;
        for (int j = 0; j < info.docCount; j++) {
          if (!reader.isDeleted(j)) {
            TermFreqVector[] tfv = reader.getTermFreqVectors(j);
            if (tfv != null) {
              totVectors += tfv.length;
            }
          }
        }

        out.println("OK [" + totVectors + " total vector count; avg " + nf.format((((float) totVectors)/docCount)) + " term/freq vector fields per doc]");
        out.println("");

      } catch (Throwable t) {
        out.println("FAILED");
        String comment;
        if (doFix) {
          comment = "will remove reference to this segment (-fix is specified)";
        } else {
          comment = "would remove reference to this segment (-fix was not specified)";
        }
        out.println(" WARNING: " + comment + "; full exception:");
        t.printStackTrace(out);
        out.println("");
        totLoseDocCount += toLoseDocCount;
        numBadSegments++;
        changed = true;
        continue;
      } finally {
        if (reader != null) {
          reader.close();
        }
      }

      // Keeper
      newSIS.add(info.clone());
    }

    if (!changed) {
      out.println("No problems were detected with this index.\n");
      return true;
    } else {
      out.println("WARNING: " + numBadSegments + " broken segments detected");
      if (doFix) {
        out.println("WARNING: " + totLoseDocCount + " documents will be lost");
      } else {
        out.println("WARNING: " + totLoseDocCount + " documents would be lost if -fix were specified");
      }
      out.println();
    }

    if (doFix) {
      out.println("NOTE: will write new segments file in 5 seconds; this will remove " + totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");

      // Remember an interrupt instead of re-asserting it inside the loop:
      // with the flag set, the retried Thread.sleep would throw again at
      // once and the countdown would spin forever.
      boolean interrupted = false;
      for (int i = 0; i < 5; i++) {
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ie) {
          interrupted = true;
          // Redo this tick of the countdown.
          i--;
          continue;
        }
        out.println(" " + (5-i) + "...");
      }
      if (interrupted) {
        // Restore the interrupt status for whoever called us.
        Thread.currentThread().interrupt();
      }

      out.print("Writing...");
      try {
        newSIS.write(dir);
      } catch (Throwable t) {
        out.println("FAILED; exiting");
        t.printStackTrace(out);
        return false;
      }
      out.println("OK");
      out.println("Wrote new segments file \"" + newSIS.getCurrentSegmentFileName() + "\"");
    } else {
      out.println("NOTE: would write new segments file [-fix was not specified]");
    }
    out.println("");

    return false;
  }

  /** Set to true by {@link #testAsserts()}, i.e. only when assertions are enabled. */
  static boolean assertsOn;

  /**
   * Invoked from inside an <code>assert</code> statement so that it only
   * runs when assertions are enabled; records that fact in
   * {@link #assertsOn}.
   *
   * @return always true, so the enclosing assert never fails
   */
  private static boolean testAsserts() {
    assertsOn = true;
    return true;
  }

  /**
   * Command-line entry point: <code>CheckIndex pathToIndex [-fix]</code>.
   * The <code>-fix</code> flag may be given before or after the index
   * path.  Exits with code 0 when {@link #check} reports a clean index and
   * with code 1 otherwise (including bad arguments or an unopenable
   * directory).
   *
   * @param args the index path and, optionally, <code>-fix</code>
   */
  public static void main(String[] args) throws Throwable {

    boolean doFix = false;
    String dirName = null;
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-fix")) {
        doFix = true;
      } else if (dirName == null) {
        // First non-flag argument is the index path; previously args[0]
        // was used blindly, so "-fix pathToIndex" opened a directory
        // named "-fix".
        dirName = args[i];
      }
    }

    if (dirName == null || args.length != (doFix ? 2:1)) {
      out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix]\n" +
                  "\n" +
                  " -fix: actually write a new segments_N file, removing any problematic segments\n" +
                  "\n" +
                  "**WARNING**: -fix should only be used on an emergency basis as it will cause\n" +
                  "documents (perhaps many) to be permanently removed from the index. Always make\n" +
                  "a backup copy of your index before running this! Do not run this tool on an index\n" +
                  "that is actively being written to. You have been warned!\n" +
                  "\n" +
                  "Run without -fix, this tool will open the index, report version information\n" +
                  "and report any exceptions it hits and what action it would take if -fix were\n" +
                  "specified. With -fix, this tool will remove any segments that have issues and\n" +
                  "write a new segments_N file. This means all documents contained in the affected\n" +
                  "segments will be removed.\n" +
                  "\n" +
                  "This tool exits with exit code 1 if the index cannot be opened or has has any\n" +
                  "corruption, else 0.\n");
      System.exit(1);
    }

    // testAsserts() only runs (and sets assertsOn) when assertions are on.
    assert testAsserts();
    if (!assertsOn) {
      out.println("\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene', so assertions are enabled");
    }

    out.println("\nOpening index @ " + dirName + "\n");
    Directory dir = null;
    try {
      dir = FSDirectory.getDirectory(dirName);
    } catch (Throwable t) {
      out.println("ERROR: could not open directory \"" + dirName + "\"; exiting");
      t.printStackTrace(out);
      System.exit(1);
    }

    boolean isClean = check(dir, doFix);

    final int exitCode;
    if (isClean) {
      exitCode = 0;
    } else {
      exitCode = 1;
    }
    System.exit(exitCode);
  }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -