📄 indexwriter.java
字号:
*/
  public synchronized void deleteDocuments(Term term) throws IOException {
    bufferDeleteTerm(term);     // buffer the delete; it is applied at flush time
    maybeFlushRamSegments();    // flush if buffered docs/deletes reached the threshold
  }

  /**
   * Deletes the document(s) containing any of the terms.
   * All deletes are flushed at the same time.
   *
   * @param terms array of terms to identify the documents to be deleted
   */
  public synchronized void deleteDocuments(Term[] terms) throws IOException {
    for (int i = 0; i < terms.length; i++) {
      bufferDeleteTerm(terms[i]);
    }
    maybeFlushRamSegments();
  }

  /**
   * Updates a document by first deleting the document(s)
   * containing <code>term</code> and then adding the new
   * document.  The delete and then add are atomic as seen
   * by a reader on the same index (flush may happen only after
   * the add).  Uses this writer's default analyzer
   * ({@link #getAnalyzer()}).
   *
   * @param term the term to identify the document(s) to be deleted
   * @param doc the document to be added
   */
  public void updateDocument(Term term, Document doc) throws IOException {
    updateDocument(term, doc, getAnalyzer());
  }

  /**
   * Updates a document by first deleting the document(s)
   * containing <code>term</code> and then adding the new
   * document.  The delete and then add are atomic as seen
   * by a reader on the same index (flush may happen only after
   * the add).
* @param term the term to identify the document(s) to be deleted
   * @param doc the document to be added
   * @param analyzer the analyzer to use when analyzing the document
   */
  public void updateDocument(Term term, Document doc, Analyzer analyzer)
      throws IOException {
    // Build the single-document segment outside the writer lock, then
    // buffer the delete and register the new segment atomically.
    SegmentInfo newSegmentInfo = buildSingleDocSegment(doc, analyzer);
    synchronized (this) {
      bufferDeleteTerm(term);
      ramSegmentInfos.addElement(newSegmentInfo);
      maybeFlushRamSegments();
    }
  }

  // Generates a unique name for a new RAM-resident segment, e.g. "_ram_a"
  // (counter rendered in base Character.MAX_RADIX).
  final synchronized String newRamSegmentName() {
    return "_ram_" + Integer.toString(ramSegmentInfos.counter++, Character.MAX_RADIX);
  }

  // for test purposes: number of on-disk segments
  final synchronized int getSegmentCount(){
    return segmentInfos.size();
  }

  // for test purposes: number of buffered RAM segments
  final synchronized int getRamSegmentCount(){
    return ramSegmentInfos.size();
  }

  // for test purposes: doc count of segment i, or -1 if i is out of range
  final synchronized int getDocCount(int i) {
    if (i >= 0 && i < segmentInfos.size()) {
      return segmentInfos.info(i).docCount;
    } else {
      return -1;
    }
  }

  // Generates a unique name for a new on-disk segment, e.g. "_a".
  final synchronized String newSegmentName() {
    return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
  }

  /** Determines how often segment indices are merged by addDocument().  With
   * smaller values, less RAM is used while indexing, and searches on
   * unoptimized indices are faster, but indexing speed is slower.  With larger
   * values, more RAM is used during indexing, and while searches on unoptimized
   * indices are slower, indexing is faster.  Thus larger values (> 10) are best
   * for batch index creation, and smaller values (< 10) for indices that are
   * interactively maintained.
   *
   * <p>This must never be less than 2.  The default value is
   * {@link #DEFAULT_MERGE_FACTOR}.
   */
  private int mergeFactor = DEFAULT_MERGE_FACTOR;

  /** Determines the minimal number of documents required before the buffered
   * in-memory documents are merging and a new Segment is created.
   * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
   * large value gives faster indexing.
At the same time, mergeFactor limits
   * the number of files open in a FSDirectory.
   *
   * <p>The default value is {@link #DEFAULT_MAX_BUFFERED_DOCS}.
   */
  private int minMergeDocs = DEFAULT_MAX_BUFFERED_DOCS;

  /** Determines the largest number of documents ever merged by addDocument().
   * Small values (e.g., less than 10,000) are best for interactive indexing,
   * as this limits the length of pauses while indexing to a few seconds.
   * Larger values are best for batched indexing and speedier searches.
   *
   * <p>The default value is {@link #DEFAULT_MAX_MERGE_DOCS}.
   */
  private int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

  /** If non-null, information about merges will be printed to this. */
  private PrintStream infoStream = null;

  /** Merges all segments together into a single segment,
   * optimizing an index for search.
   *
   * <p>Note that this requires substantial temporary free
   * space in the Directory (see <a target="_top"
   * href="http://issues.apache.org/jira/browse/LUCENE-764">LUCENE-764</a>
   * for details):</p>
   *
   * <ul>
   * <li>
   *
   * <p>If no readers/searchers are open against the index,
   * then free space required is up to 1X the total size of
   * the starting index.  For example, if the starting
   * index is 10 GB, then you must have up to 10 GB of free
   * space before calling optimize.</p>
   *
   * <li>
   *
   * <p>If readers/searchers are using the index, then free
   * space required is up to 2X the size of the starting
   * index.  This is because in addition to the 1X used by
   * optimize, the original 1X of the starting index is
   * still consuming space in the Directory as the readers
   * are holding the segments files open.
Even on Unix,
   * where it will appear as if the files are gone ("ls"
   * won't list them), they still consume storage due to
   * "delete on last close" semantics.</p>
   *
   * <p>Furthermore, if some but not all readers re-open
   * while the optimize is underway, this will cause > 2X
   * temporary space to be consumed as those new readers
   * will then hold open the partially optimized segments at
   * that time.  It is best not to re-open readers while
   * optimize is running.</p>
   *
   * </ul>
   *
   * <p>The actual temporary usage could be much less than
   * these figures (it depends on many factors).</p>
   *
   * <p>Once the optimize completes, the total size of the
   * index will be less than the size of the starting index.
   * It could be quite a bit smaller (if there were many
   * pending deletes) or just slightly smaller.</p>
   *
   * <p>If an Exception is hit during optimize(), for example
   * due to disk full, the index will not be corrupt and no
   * documents will have been lost.  However, it may have
   * been partially optimized (some segments were merged but
   * not all), and it's possible that one of the segments in
   * the index will be in non-compound format even when
   * using compound file format.  This will occur when the
   * Exception is hit during conversion of the segment into
   * compound format.</p>
   */
  public synchronized void optimize() throws IOException {
    flushRamSegments();
    // Keep merging until exactly one segment remains that: has no
    // deletions, has no separate norms files, lives in this writer's
    // directory, and (when compound files are enabled) uses the
    // compound format.
    while (segmentInfos.size() > 1 ||
           (segmentInfos.size() == 1 &&
            (SegmentReader.hasDeletions(segmentInfos.info(0)) ||
             SegmentReader.hasSeparateNorms(segmentInfos.info(0)) ||
             segmentInfos.info(0).dir != directory ||
             (useCompoundFile &&
              (!SegmentReader.usesCompoundFile(segmentInfos.info(0))))))) {
      // Merge at most the trailing mergeFactor segments per pass.
      int minSegment = segmentInfos.size() - mergeFactor;
      mergeSegments(segmentInfos, minSegment < 0 ? 0 : minSegment, segmentInfos.size());
    }
  }

  /*
   * Begin a transaction.
During a transaction, any segment
   * merges that happen (or ram segments flushed) will not
   * write a new segments file and will not remove any files
   * that were present at the start of the transaction.  You
   * must make a matched (try/finally) call to
   * commitTransaction() or rollbackTransaction() to finish
   * the transaction.
   */
  private void startTransaction() throws IOException {
    if (inTransaction) {
      throw new IOException("transaction is already in process");
    }
    // Snapshot the current segments so rollback can restore them.
    rollbackSegmentInfos = (SegmentInfos) segmentInfos.clone();
    // Record the names of all segments alive at transaction start; their
    // files must be protected from deletion while the transaction runs.
    protectedSegments = new HashSet();
    for(int i=0;i<segmentInfos.size();i++) {
      SegmentInfo si = (SegmentInfo) segmentInfos.elementAt(i);
      protectedSegments.add(si.name);
    }
    inTransaction = true;
  }

  /*
   * Rolls back the transaction and restores state to where
   * we were at the start.
   */
  private void rollbackTransaction() throws IOException {

    // Keep the same segmentInfos instance but replace all
    // of its SegmentInfo instances.  This is so the next
    // attempt to commit using this instance of IndexWriter
    // will always write to a new generation ("write once").
    segmentInfos.clear();
    segmentInfos.addAll(rollbackSegmentInfos);

    // Ask deleter to locate unreferenced files & remove them:
    deleter.clearPendingFiles();
    deleter.findDeletableFiles();
    deleter.deleteFiles();

    clearTransaction();
  }

  /*
   * Commits the transaction.
This will write the new
   * segments file and remove any pending deletions we have
   * accumulated during the transaction.
   */
  private void commitTransaction() throws IOException {
    if (commitPending) {
      boolean success = false;
      try {
        // If we hit e.g. disk full during this write we have to rollback:
        segmentInfos.write(directory);    // commit changes
        success = true;
      } finally {
        if (!success) {
          rollbackTransaction();
        }
      }
      deleter.commitPendingFiles();
      commitPending = false;
    }
    clearTransaction();
  }

  /* Should only be called by rollbackTransaction &
   * commitTransaction */
  private void clearTransaction() {
    protectedSegments = null;
    rollbackSegmentInfos = null;
    inTransaction = false;
  }

  /** Merges all segments from an array of indexes into this index.
   *
   * <p>This may be used to parallelize batch indexing.  A large document
   * collection can be broken into sub-collections.  Each sub-collection can be
   * indexed in parallel, on a different thread, process or machine.  The
   * complete index can then be created by merging sub-collection indexes
   * with this method.
   *
   * <p>After this completes, the index is optimized.
   *
   * <p>This method is transactional in how Exceptions are
   * handled: it does not commit a new segments_N file until
   * all indexes are added.  This means if an Exception
   * occurs (for example disk full), then either no indexes
   * will have been added or they all will have been.</p>
   *
   * <p>If an Exception is hit, it's still possible that all
   * indexes were successfully added.  This happens when the
   * Exception is hit when trying to build a CFS file.  In
   * this case, one segment in the index will be in non-CFS
   * format, even when using compound file format.</p>
   *
   * <p>Also note that on an Exception, the index may still
   * have been partially or fully optimized even though none
   * of the input indexes were added.</p>
   *
   * <p>Note that this requires temporary free space in the
   * Directory up to 2X the sum of all input indexes
   * (including the starting index).
If readers/searchers
   * are open against the starting index, then temporary
   * free space required will be higher by the size of the
   * starting index (see {@link #optimize()} for details).
   * </p>
   *
   * <p>Once this completes, the final size of the index
   * will be less than the sum of all input index sizes
   * (including the starting index).  It could be quite a
   * bit smaller (if there were many pending deletes) or
   * just slightly smaller.</p>
   *
   * <p>See <a target="_top"
   * href="http://issues.apache.org/jira/browse/LUCENE-702">LUCENE-702</a>
   * for details.</p>
   */
  public synchronized void addIndexes(Directory[] dirs)
      throws IOException {

    optimize();    // start with zero or 1 seg

    int start = segmentInfos.size();
    boolean success = false;

    // All merges below run inside a transaction so a failure leaves the
    // original segments file (and its referenced files) intact.
    startTransaction();

    try {
      // Append every segment from every input directory to our infos.
      for (int i = 0; i < dirs.length; i++) {
        SegmentInfos sis = new SegmentInfos();    // read infos from dir
        sis.read(dirs[i]);
        for (int j = 0; j < sis.size(); j++) {
          segmentInfos.addElement(sis.info(j));   // add each info
        }
      }

      // merge newly added segments in log(n) passes
      while (segmentInfos.size() > start+mergeFactor) {
        for (int base = start; base < segmentInfos.size(); base++) {
          int end = Math.min(segmentInfos.size(), base+mergeFactor);
          if (end-base > 1) {
            mergeSegments(segmentInfos, base, end);
          }
        }
      }
      success = true;
    } finally {
      if (success) {
        commitTransaction();
      } else {
        rollbackTransaction();
      }
    }

    optimize();    // final cleanup
  }

  /**
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -