📄 FreqProxTermsWriter.java
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.UnicodeUtil;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;

final class FreqProxTermsWriter extends TermsHashConsumer {

  public TermsHashConsumerPerThread addThread(TermsHashPerThread perThread) {
    return new FreqProxTermsWriterPerThread(perThread);
  }

  void createPostings(RawPostingList[] postings, int start, int count) {
    final int end = start + count;
    for(int i=start;i<end;i++)
      postings[i] = new PostingList();
  }

  private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
    while(true) {
      final char c1 = text1[pos1++];
      final char c2 = text2[pos2++];
      if (c1 != c2) {
        if (0xffff == c2)
          return 1;
        else if (0xffff == c1)
          return -1;
        else
          return c1-c2;
      } else if (0xffff == c1)
        return 0;
    }
  }

  void closeDocStore(DocumentsWriter.FlushState state) {}
  void abort() {}

  // TODO: would be nice to factor out more of this, eg the
  // FreqProxFieldMergeState, and code to visit all Fields
  // under the same FieldInfo together, up into TermsHash*.
  // Other writers would presumably share a lot of this...
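  // flush() is called when the buffered documents are written out as a
  // new segment: it gathers every per-thread/per-field postings hash
  // that actually holds postings, sorts the fields by name, and for each
  // field merges the per-thread postings into the segment's freq/prox
  // streams and terms dictionary via appendPostings(), then resets the
  // per-field and per-thread state.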
  public void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException {

    // Gather all FieldData's that have postings, across all
    // ThreadStates
    List allFields = new ArrayList();

    Iterator it = threadsAndFields.entrySet().iterator();
    while(it.hasNext()) {
      Map.Entry entry = (Map.Entry) it.next();

      Collection fields = (Collection) entry.getValue();

      Iterator fieldsIt = fields.iterator();
      while(fieldsIt.hasNext()) {
        FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) fieldsIt.next();
        if (perField.termsHashPerField.numPostings > 0)
          allFields.add(perField);
      }
    }

    // Sort by field name
    Collections.sort(allFields);
    final int numAllFields = allFields.size();

    final TermInfosWriter termsOut = new TermInfosWriter(state.directory,
                                                         state.segmentName,
                                                         fieldInfos,
                                                         state.docWriter.writer.getTermIndexInterval());

    final IndexOutput freqOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.FREQ_EXTENSION));
    final IndexOutput proxOut;

    if (fieldInfos.hasProx())
      proxOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.PROX_EXTENSION));
    else
      proxOut = null;

    final DefaultSkipListWriter skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
                                                                           termsOut.maxSkipLevels,
                                                                           state.numDocsInRAM, freqOut, proxOut);

    int start = 0;
    while(start < numAllFields) {
      final FieldInfo fieldInfo = ((FreqProxTermsWriterPerField) allFields.get(start)).fieldInfo;
      final String fieldName = fieldInfo.name;

      int end = start+1;
      while(end < numAllFields && ((FreqProxTermsWriterPerField) allFields.get(end)).fieldInfo.name.equals(fieldName))
        end++;

      FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end-start];
      for(int i=start;i<end;i++) {
        fields[i-start] = (FreqProxTermsWriterPerField) allFields.get(i);

        // Aggregate the storePayload as seen by the same
        // field across multiple threads
        fieldInfo.storePayloads |= fields[i-start].hasPayloads;
      }

      // If this field has postings then add them to the
      // segment
      appendPostings(state, fields, termsOut, freqOut, proxOut, skipListWriter);

      for(int i=0;i<fields.length;i++) {
        TermsHashPerField perField = fields[i].termsHashPerField;
        int numPostings = perField.numPostings;
        perField.reset();
        perField.shrinkHash(numPostings);
        fields[i].reset();
      }

      start = end;
    }

    it = threadsAndFields.entrySet().iterator();
    while(it.hasNext()) {
      Map.Entry entry = (Map.Entry) it.next();
      FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.getKey();
      perThread.termsHashPerThread.reset(true);
    }

    freqOut.close();
    if (proxOut != null) {
      state.flushedFiles.add(state.segmentFileName(IndexFileNames.PROX_EXTENSION));
      proxOut.close();
    }
    termsOut.close();

    // Record all files we have flushed
    state.flushedFiles.add(state.segmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION));
    state.flushedFiles.add(state.segmentFileName(IndexFileNames.FREQ_EXTENSION));
    state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION));
    state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));
  }

  final byte[] copyByteBuffer = new byte[4096];

  /** Copy numBytes from srcIn to destIn */
  void copyBytes(IndexInput srcIn, IndexOutput destIn, long numBytes) throws IOException {
    // TODO: we could do this more efficiently (save a copy)
    // because it's always from a ByteSliceReader ->
    // IndexOutput
    while(numBytes > 0) {
      final int chunk;
      if (numBytes > 4096)
        chunk = 4096;
      else
        chunk = (int) numBytes;
      srcIn.readBytes(copyByteBuffer, 0, chunk);
      destIn.writeBytes(copyByteBuffer, 0, chunk);
      numBytes -= chunk;
    }
  }
  /* Walk through all unique text tokens (Posting
   * instances) found in this field and serialize them
   * into a single RAM segment. */
  void appendPostings(final DocumentsWriter.FlushState flushState,
                      FreqProxTermsWriterPerField[] fields,
                      TermInfosWriter termsOut,
                      IndexOutput freqOut,
                      IndexOutput proxOut,
                      DefaultSkipListWriter skipListWriter)
    throws CorruptIndexException, IOException {

    final int fieldNumber = fields[0].fieldInfo.number;
    int numFields = fields.length;

    final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

    for(int i=0;i<numFields;i++) {
      FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

      assert fms.field.fieldInfo == fields[0].fieldInfo;

      // Should always be true
      boolean result = fms.nextTerm();
      assert result;
    }

    final int skipInterval = termsOut.skipInterval;
    final boolean currentFieldOmitTf = fields[0].fieldInfo.omitTf;

    // If current field omits tf then it cannot store
    // payloads.  We silently drop the payloads in this case:
    final boolean currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

    while(numFields > 0) {

      // Get the next term to merge
      termStates[0] = mergeStates[0];
      int numToMerge = 1;

      for(int i=1;i<numFields;i++) {
        final char[] text = mergeStates[i].text;
        final int textOffset = mergeStates[i].textOffset;
        final int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

        if (cmp < 0) {
          termStates[0] = mergeStates[i];
          numToMerge = 1;
        } else if (cmp == 0)
          termStates[numToMerge++] = mergeStates[i];
      }

      int df = 0;
      int lastPayloadLength = -1;

      int lastDoc = 0;

      final char[] text = termStates[0].text;
      final int start = termStates[0].textOffset;

      final long freqPointer = freqOut.getFilePointer();
      final long proxPointer;
      if (proxOut != null)
        proxPointer = proxOut.getFilePointer();
      else
        proxPointer = 0;

      skipListWriter.resetSkip();

      // Now termStates has numToMerge FieldMergeStates
      // which all share the same term.  Now we must
      // interleave the docID streams.
      while(numToMerge > 0) {

        if ((++df % skipInterval) == 0) {
          skipListWriter.setSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
          skipListWriter.bufferSkip(df);
        }

        FreqProxFieldMergeState minState = termStates[0];
        for(int i=1;i<numToMerge;i++)
          if (termStates[i].docID < minState.docID)
            minState = termStates[i];

        final int doc = minState.docID;
        final int termDocFreq = minState.termFreq;

        assert doc < flushState.numDocsInRAM;
        assert doc > lastDoc || df == 1;

        final ByteSliceReader prox = minState.prox;

        // Carefully copy over the prox + payload info,
        // changing the format to match Lucene's segment
        // format.
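        // In the in-RAM slices each position is encoded as
        // (positionDelta << 1), with the low bit set when that position
        // carries a payload.  In the on-disk prox stream the low bit
        // instead means "a new payload length follows", so the length is
        // only re-written when it differs from the previous position's;
        // fields that never store payloads get the plain delta (code >> 1).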
        if (!currentFieldOmitTf) {
          // omitTf == false so we do write positions & payload
          assert proxOut != null;
          for(int j=0;j<termDocFreq;j++) {
            final int code = prox.readVInt();
            if (currentFieldStorePayloads) {
              final int payloadLength;
              if ((code & 1) != 0) {
                // This position has a payload
                payloadLength = prox.readVInt();
              } else
                payloadLength = 0;
              if (payloadLength != lastPayloadLength) {
                proxOut.writeVInt(code|1);
                proxOut.writeVInt(payloadLength);
                lastPayloadLength = payloadLength;
              } else
                proxOut.writeVInt(code & (~1));
              if (payloadLength > 0)
                copyBytes(prox, proxOut, payloadLength);
            } else {
              assert 0 == (code & 1);
              proxOut.writeVInt(code>>1);
            }
          } //End for

          final int newDocCode = (doc-lastDoc)<<1;

          if (1 == termDocFreq) {
            freqOut.writeVInt(newDocCode|1);
          } else {
            freqOut.writeVInt(newDocCode);
            freqOut.writeVInt(termDocFreq);
          }
        } else {
          // omitTf==true: we store only the docs, without
          // term freq, positions, payloads
          freqOut.writeVInt(doc-lastDoc);
        }

        lastDoc = doc;

        if (!minState.nextDoc()) {

          // Remove from termStates
          int upto = 0;
          for(int i=0;i<numToMerge;i++)
            if (termStates[i] != minState)
              termStates[upto++] = termStates[i];
          numToMerge--;
          assert upto == numToMerge;

          // Advance this state to the next term

          if (!minState.nextTerm()) {
            // OK, no more terms, so remove from mergeStates
            // as well
            upto = 0;
            for(int i=0;i<numFields;i++)
              if (mergeStates[i] != minState)
                mergeStates[upto++] = mergeStates[i];
            numFields--;
            assert upto == numFields;
          }
        }
      }

      assert df > 0;

      // Done merging this term

      long skipPointer = skipListWriter.writeSkip(freqOut);

      // Write term
      termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));

      // TODO: we could do this incrementally
      UnicodeUtil.UTF16toUTF8(text, start, termsUTF8);

      // TODO: we could save O(n) re-scan of the term by
      // computing the shared prefix with the last term
      // while doing the UTF8 encoding
      termsOut.add(fieldNumber,
                   termsUTF8.result,
                   termsUTF8.length,
                   termInfo);
    }
  }

  private final TermInfo termInfo = new TermInfo(); // minimize consing

  final UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result();

  void files(Collection files) {}

  static final class PostingList extends RawPostingList {
    int docFreq;      // # times this term occurs in the current doc
    int lastDocID;    // Last docID where this term occurred
    int lastDocCode;  // Code for prior doc
    int lastPosition; // Last position where this term occurred
  }

  int bytesPerPosting() {
    return RawPostingList.BYTES_SIZE + 4 * DocumentsWriter.INT_NUM_BYTE;
  }
}
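For reference, here is a minimal standalone sketch (not part of the file above; the class and method names are invented for illustration) of the doc/freq encoding that appendPostings writes to freqOut when omitTf is false: each posting stores the delta to the previous docID shifted left by one, with the low bit set when the term frequency is 1 so the frequency can be omitted. The real code writes these values as VInts; the sketch just collects the raw ints.

import java.util.ArrayList;
import java.util.List;

// Hypothetical demo class; mirrors the (docDelta << 1 | freqIsOne)
// scheme used by appendPostings for the freq stream.
class FrqEncodingDemo {

  // Encode (docID, termFreq) pairs the way appendPostings writes them.
  static List<Integer> encode(int[][] postings) {
    List<Integer> out = new ArrayList<Integer>();
    int lastDoc = 0;
    for (int[] p : postings) {
      int doc = p[0], freq = p[1];
      int docCode = (doc - lastDoc) << 1;   // delta to previous doc, shifted
      if (freq == 1) {
        out.add(docCode | 1);               // low bit set: freq is implicitly 1
      } else {
        out.add(docCode);
        out.add(freq);                      // explicit frequency follows
      }
      lastDoc = doc;
    }
    return out;
  }

  public static void main(String[] args) {
    // docID/freq pairs: (3,1), (7,4), (8,1)
    int[][] postings = { {3, 1}, {7, 4}, {8, 1} };
    // prints [7, 8, 4, 3], i.e. (3<<1)|1, (4<<1), 4, (1<<1)|1
    System.out.println(encode(postings));
  }
}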