⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 termvectorswriter.java

📁 索引aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
💻 JAVA
字号:
package org.apache.lucene.index;import org.apache.lucene.store.Directory;import org.apache.lucene.store.OutputStream;import org.apache.lucene.util.StringHelper;import java.io.IOException;import java.util.Vector;/** * Writer works by opening a document and then opening the fields within the document and then * writing out the vectors for each field. *  * Rough usage: * <CODE> for each document { writer.openDocument(); for each field on the document { writer.openField(field); for all of the terms { writer.addTerm(...) } writer.closeField } writer.closeDocument()     } </CODE> */final class TermVectorsWriter {  public static final int FORMAT_VERSION = 1;  //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file   public static final int FORMAT_SIZE = 4;    //TODO: Figure out how to write with or w/o position information and read back in  public static final String TVX_EXTENSION = ".tvx";  public static final String TVD_EXTENSION = ".tvd";  public static final String TVF_EXTENSION = ".tvf";  private OutputStream tvx = null, tvd = null, tvf = null;  private Vector fields = null;  private Vector terms = null;  private FieldInfos fieldInfos;  private TVField currentField = null;  private long currentDocPointer = -1;  /** Create term vectors writer for the specified segment in specified   *  directory.  A new TermVectorsWriter should be created for each   *  segment. The parameter <code>maxFields</code> indicates how many total   *  fields are found in this document. Not all of these fields may require   *  termvectors to be stored, so the number of calls to   *  <code>openField</code> is less or equal to this number.   */  public TermVectorsWriter(Directory directory, String segment,                           FieldInfos fieldInfos)    throws IOException {    // Open files for TermVector storage    tvx = directory.createFile(segment + TVX_EXTENSION);    tvx.writeInt(FORMAT_VERSION);    tvd = directory.createFile(segment + TVD_EXTENSION);    tvd.writeInt(FORMAT_VERSION);    tvf = directory.createFile(segment + TVF_EXTENSION);    tvf.writeInt(FORMAT_VERSION);    this.fieldInfos = fieldInfos;    fields = new Vector(fieldInfos.size());    terms = new Vector();  }  public final void openDocument()          throws IOException {    closeDocument();    currentDocPointer = tvd.getFilePointer();  }  public final void closeDocument()          throws IOException {    if (isDocumentOpen()) {      closeField();      writeDoc();      fields.clear();      currentDocPointer = -1;    }  }  public final boolean isDocumentOpen() {    return currentDocPointer != -1;  }  /** Start processing a field. This can be followed by a number of calls to   *  addTerm, and a final call to closeField to indicate the end of   *  processing of this field. If a field was previously open, it is   *  closed automatically.   */  public final void openField(String field)          throws IOException {    if (!isDocumentOpen()) throw new IllegalStateException("Cannot open field when no document is open.");    closeField();    currentField = new TVField(fieldInfos.fieldNumber(field));  }  /** Finished processing current field. This should be followed by a call to   *  openField before future calls to addTerm.   */  public final void closeField()          throws IOException {    if (isFieldOpen()) {      /* DEBUG */      //System.out.println("closeField()");      /* DEBUG */      // save field and terms      writeField();      fields.add(currentField);      terms.clear();      currentField = null;    }  }  /** Return true if a field is currently open. */  public final boolean isFieldOpen() {    return currentField != null;  }  /** Add term to the field's term vector. Field must already be open   *  of NullPointerException is thrown. Terms should be added in   *  increasing order of terms, one call per unique termNum. ProxPointer   *  is a pointer into the TermPosition file (prx). Freq is the number of   *  times this term appears in this field, in this document.   */  public final void addTerm(String termText, int freq) {    if (!isDocumentOpen()) throw new IllegalStateException("Cannot add terms when document is not open");    if (!isFieldOpen()) throw new IllegalStateException("Cannot add terms when field is not open");    addTermInternal(termText, freq);  }  private final void addTermInternal(String termText, int freq) {    currentField.length += freq;    TVTerm term = new TVTerm();    term.termText = termText;    term.freq = freq;    terms.add(term);  }  /** Add specified vectors to the document.   */  public final void addVectors(TermFreqVector[] vectors)          throws IOException {    if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vectors when document is not open");    if (isFieldOpen()) throw new IllegalStateException("Cannot add term vectors when field is open");    for (int i = 0; i < vectors.length; i++) {      addTermFreqVector(vectors[i]);    }  }  /** Add specified vector to the document. Document must be open but no field   *  should be open or exception is thrown. The same document can have <code>addTerm</code>   *  and <code>addVectors</code> calls mixed, however a given field must either be   *  populated with <code>addTerm</code> or with <code>addVector</code>.     *   */  public final void addTermFreqVector(TermFreqVector vector)          throws IOException {    if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vector when document is not open");    if (isFieldOpen()) throw new IllegalStateException("Cannot add term vector when field is open");    addTermFreqVectorInternal(vector);  }  private final void addTermFreqVectorInternal(TermFreqVector vector)          throws IOException {    openField(vector.getField());    for (int i = 0; i < vector.size(); i++) {      addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i]);    }    closeField();  }       /** Close all streams. */  final void close() throws IOException {    try {      closeDocument();    } finally {      // make an effort to close all streams we can but remember and re-throw      // the first exception encountered in this process      IOException keep = null;      if (tvx != null)        try {          tvx.close();        } catch (IOException e) {          if (keep == null) keep = e;        }      if (tvd != null)        try {          tvd.close();        } catch (IOException e) {          if (keep == null) keep = e;        }      if (tvf != null)        try {          tvf.close();        } catch (IOException e) {          if (keep == null) keep = e;        }      if (keep != null) throw (IOException) keep.fillInStackTrace();    }  }    private void writeField() throws IOException {    // remember where this field is written    currentField.tvfPointer = tvf.getFilePointer();    //System.out.println("Field Pointer: " + currentField.tvfPointer);    final int size;    tvf.writeVInt(size = terms.size());    tvf.writeVInt(currentField.length - size);    String lastTermText = "";    // write term ids and positions    for (int i = 0; i < size; i++) {      TVTerm term = (TVTerm) terms.elementAt(i);      //tvf.writeString(term.termText);      int start = StringHelper.stringDifference(lastTermText, term.termText);      int length = term.termText.length() - start;      tvf.writeVInt(start);			  // write shared prefix length      tvf.writeVInt(length);			  // write delta length      tvf.writeChars(term.termText, start, length);  // write delta chars      tvf.writeVInt(term.freq);      lastTermText = term.termText;    }  }  private void writeDoc() throws IOException {    if (isFieldOpen()) throw new IllegalStateException("Field is still open while writing document");    //System.out.println("Writing doc pointer: " + currentDocPointer);    // write document index record    tvx.writeLong(currentDocPointer);    // write document data record    final int size;    // write the number of fields    tvd.writeVInt(size = fields.size());    // write field numbers    int lastFieldNumber = 0;    for (int i = 0; i < size; i++) {      TVField field = (TVField) fields.elementAt(i);      tvd.writeVInt(field.number - lastFieldNumber);      lastFieldNumber = field.number;    }    // write field pointers    long lastFieldPointer = 0;    for (int i = 0; i < size; i++) {      TVField field = (TVField) fields.elementAt(i);      tvd.writeVLong(field.tvfPointer - lastFieldPointer);      lastFieldPointer = field.tvfPointer;    }    //System.out.println("After writing doc pointer: " + tvx.getFilePointer());  }  private static class TVField {    int number;    long tvfPointer = 0;    int length = 0;   // number of distinct term positions    TVField(int number) {      this.number = number;    }  }  private static class TVTerm {    String termText;    int freq = 0;    //int positions[] = null;  }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -