📄 checkindex.java

📁 Lucene a java open-source SearchEngine Framework
💻 JAVA
字号:
package org.apache.lucene.index;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import org.apache.lucene.store.FSDirectory;import org.apache.lucene.store.Directory;import org.apache.lucene.store.IndexInput;import org.apache.lucene.document.Document;import java.text.NumberFormat;import java.io.PrintStream;import java.io.IOException;import java.util.Collection;import java.util.Iterator;/** * Basic tool to check the health of an index and write a * new segments file that removes reference to problematic * segments.  There are many more checks that this tool * could do but does not yet, eg: reconstructing a segments * file by looking for all loadable segments (if no segments * file is found), removing specifically specified segments, * listing files that exist but are not referenced, etc. */public class CheckIndex {  public static PrintStream out = System.out;  private static class MySegmentTermDocs extends SegmentTermDocs {    int delCount;    MySegmentTermDocs(SegmentReader p) {          super(p);    }    public void seek(Term term) throws IOException {      super.seek(term);      delCount = 0;    }    protected void skippingDoc() throws IOException {      delCount++;    }  }  /** Returns true if index is clean, else false.*/  public static boolean check(Directory dir, boolean doFix) throws IOException {    NumberFormat nf = NumberFormat.getInstance();    SegmentInfos sis = new SegmentInfos();        try {      sis.read(dir);    } catch (Throwable t) {      out.println("ERROR: could not read any segments file in directory");      t.printStackTrace(out);      return false;    }    final int numSegments = sis.size();    final String segmentsFileName = sis.getCurrentSegmentFileName();    IndexInput input = null;    try {      input = dir.openInput(segmentsFileName);    } catch (Throwable t) {      out.println("ERROR: could not open segments file in directory");      t.printStackTrace(out);      return false;    }    int format = 0;    try {      format = input.readInt();    } catch (Throwable t) {      out.println("ERROR: could not read segment file version in directory");      t.printStackTrace(out);      return false;    } finally {      if (input != null)        input.close();    }    String sFormat = "";    boolean skip = false;    if (format == SegmentInfos.FORMAT)      sFormat = "FORMAT [Lucene Pre-2.1]";    if (format == SegmentInfos.FORMAT_LOCKLESS)      sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";    else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE)      sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";    else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE)      sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";    else if (format < SegmentInfos.FORMAT_SHARED_DOC_STORE) {      sFormat = "int=" + format + " [newer version of Lucene than this tool]";      skip = true;    } else {      sFormat = format + " [Lucene 1.3 or prior]";    }    out.println("Segments file=" + segmentsFileName + " numSegments=" + numSegments + " version=" + sFormat);    if (skip) {      out.println("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");      return false;    }    SegmentInfos newSIS = (SegmentInfos) sis.clone();    newSIS.clear();    boolean changed = false;    int totLoseDocCount = 0;    int numBadSegments = 0;    for(int i=0;i<numSegments;i++) {      final SegmentInfo info = sis.info(i);      out.println("  " + (1+i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount);      int toLoseDocCount = info.docCount;      SegmentReader reader = null;      try {        out.println("    compound=" + info.getUseCompoundFile());        out.println("    numFiles=" + info.files().size());        out.println("    size (MB)=" + nf.format(info.sizeInBytes()/(1024.*1024.)));        final int docStoreOffset = info.getDocStoreOffset();        if (docStoreOffset != -1) {          out.println("    docStoreOffset=" + docStoreOffset);          out.println("    docStoreSegment=" + info.getDocStoreSegment());          out.println("    docStoreIsCompoundFile=" + info.getDocStoreIsCompoundFile());        }        final String delFileName = info.getDelFileName();        if (delFileName == null)          out.println("    no deletions");        else          out.println("    has deletions [delFileName=" + delFileName + "]");        out.print("    test: open reader.........");        reader = SegmentReader.get(info);        final int numDocs = reader.numDocs();        toLoseDocCount = numDocs;        if (reader.hasDeletions())          out.println("OK [" + (info.docCount - numDocs) + " deleted docs]");        else          out.println("OK");        out.print("    test: fields, norms.......");        Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);        Iterator it = fieldNames.iterator();        while(it.hasNext()) {          final String fieldName = (String) it.next();          byte[] b = reader.norms(fieldName);          if (b.length != info.docCount)            throw new RuntimeException("norms for field \"" + fieldName + "\" is length " + b.length + " != maxDoc " + info.docCount);        }        out.println("OK [" + fieldNames.size() + " fields]");        out.print("    test: terms, freq, prox...");        final TermEnum termEnum = reader.terms();        final TermPositions termPositions = reader.termPositions();        // Used only to count up # deleted docs for this        // term        final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);        long termCount = 0;        long totFreq = 0;        long totPos = 0;        while(termEnum.next()) {          termCount++;          final Term term = termEnum.term();          final int docFreq = termEnum.docFreq();          termPositions.seek(term);          int lastDoc = -1;          int freq0 = 0;          totFreq += docFreq;          while(termPositions.next()) {            freq0++;            final int doc = termPositions.doc();            final int freq = termPositions.freq();            if (doc <= lastDoc)              throw new RuntimeException("term " + term + ": doc " + doc + " < lastDoc " + lastDoc);            lastDoc = doc;            if (freq <= 0)              throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");                        int lastPos = -1;            totPos += freq;            for(int j=0;j<freq;j++) {              final int pos = termPositions.nextPosition();              if (pos < -1)                throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");              if (pos < lastPos)                throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);            }          }          // Now count how many deleted docs occurred in          // this term:          final int delCount;          if (reader.hasDeletions()) {            myTermDocs.seek(term);            while(myTermDocs.next()) {            }            delCount = myTermDocs.delCount;          } else            delCount = 0;          if (freq0 + delCount != docFreq)            throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);        }        out.println("OK [" + termCount + " terms; " + totFreq + " terms/docs pairs; " + totPos + " tokens]");        out.print("    test: stored fields.......");        int docCount = 0;        long totFields = 0;        for(int j=0;j<info.docCount;j++)          if (!reader.isDeleted(j)) {            docCount++;            Document doc = reader.document(j);            totFields += doc.getFields().size();          }        if (docCount != reader.numDocs())          throw new RuntimeException("docCount=" + docCount + " but saw " + docCount + " undeleted docs");        out.println("OK [" + totFields + " total field count; avg " + nf.format((((float) totFields)/docCount)) + " fields per doc]");        out.print("    test: term vectors........");        int totVectors = 0;        for(int j=0;j<info.docCount;j++)          if (!reader.isDeleted(j)) {            TermFreqVector[] tfv = reader.getTermFreqVectors(j);            if (tfv != null)              totVectors += tfv.length;          }        out.println("OK [" + totVectors + " total vector count; avg " + nf.format((((float) totVectors)/docCount)) + " term/freq vector fields per doc]");        out.println("");      } catch (Throwable t) {        out.println("FAILED");        String comment;        if (doFix)          comment = "will remove reference to this segment (-fix is specified)";        else          comment = "would remove reference to this segment (-fix was not specified)";        out.println("    WARNING: " + comment + "; full exception:");        t.printStackTrace(out);        out.println("");        totLoseDocCount += toLoseDocCount;        numBadSegments++;        changed = true;        continue;      } finally {        if (reader != null)          reader.close();      }      // Keeper      newSIS.add(info.clone());    }    if (!changed) {      out.println("No problems were detected with this index.\n");      return true;    } else {      out.println("WARNING: " + numBadSegments + " broken segments detected");      if (doFix)        out.println("WARNING: " + totLoseDocCount + " documents will be lost");      else        out.println("WARNING: " + totLoseDocCount + " documents would be lost if -fix were specified");      out.println();    }    if (doFix) {      out.println("NOTE: will write new segments file in 5 seconds; this will remove " + totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");      for(int i=0;i<5;i++) {        try {          Thread.sleep(1000);        } catch (InterruptedException ie) {          Thread.currentThread().interrupt();          i--;          continue;        }                  out.println("  " + (5-i) + "...");      }      out.print("Writing...");      try {        newSIS.write(dir);      } catch (Throwable t) {        out.println("FAILED; exiting");        t.printStackTrace(out);        return false;      }      out.println("OK");      out.println("Wrote new segments file \"" + newSIS.getCurrentSegmentFileName() + "\"");    } else {      out.println("NOTE: would write new segments file [-fix was not specified]");    }    out.println("");    return false;  }  static boolean assertsOn;  private static boolean testAsserts() {    assertsOn = true;    return true;  }  public static void main(String[] args) throws Throwable {    boolean doFix = false;    for(int i=0;i<args.length;i++)      if (args[i].equals("-fix")) {        doFix = true;        break;      }    if (args.length != (doFix ? 2:1)) {      out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix]\n" +                         "\n" +                         "  -fix: actually write a new segments_N file, removing any problematic segments\n" +                         "\n" +                          "**WARNING**: -fix should only be used on an emergency basis as it will cause\n" +                         "documents (perhaps many) to be permanently removed from the index.  Always make\n" +                         "a backup copy of your index before running this!  Do not run this tool on an index\n" +                         "that is actively being written to.  You have been warned!\n" +                         "\n" +                         "Run without -fix, this tool will open the index, report version information\n" +                         "and report any exceptions it hits and what action it would take if -fix were\n" +                         "specified.  With -fix, this tool will remove any segments that have issues and\n" +                          "write a new segments_N file.  This means all documents contained in the affected\n" +                         "segments will be removed.\n" +                         "\n" +                         "This tool exits with exit code 1 if the index cannot be opened or has has any\n" +                         "corruption, else 0.\n");      System.exit(1);    }    assert testAsserts();    if (!assertsOn)      out.println("\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene', so assertions are enabled");    final String dirName = args[0];    out.println("\nOpening index @ " + dirName + "\n");    Directory dir = null;    try {      dir = FSDirectory.getDirectory(dirName);    } catch (Throwable t) {      out.println("ERROR: could not open directory \"" + dirName + "\"; exiting");      t.printStackTrace(out);      System.exit(1);    }    boolean isClean = check(dir, doFix);    final int exitCode;    if (isClean)      exitCode = 0;    else      exitCode = 1;    System.exit(exitCode);  }    }
💿 文件大小 5390 K
👤 上传用户 rickie936
📂 所属分类 Java编程
🏷️ 相关标签

#SearchEngine #open-source #Framework #Lucene
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -