📄 termvectorswriter.java
字号:
package org.apache.lucene.index;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.util.StringHelper;

import java.io.IOException;
import java.util.Vector;

/**
 * Writes the term vectors of one segment into three files: an index file
 * ({@link #TVX_EXTENSION}), a per-document file ({@link #TVD_EXTENSION}) and a
 * per-field file ({@link #TVF_EXTENSION}).
 *
 * <p>The writer works by opening a document, then opening the fields within
 * that document, and writing out the terms of each field.
 *
 * <p>Rough usage:
 * <pre>
 * for each document {
 *   writer.openDocument();
 *   for each field on the document {
 *     writer.openField(field);
 *     for all of the terms {
 *       writer.addTerm(...)
 *     }
 *     writer.closeField();
 *   }
 *   writer.closeDocument();
 * }
 * writer.close();
 * </pre>
 */
final class TermVectorsWriter {
  public static final int FORMAT_VERSION = 1;

  // The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
  public static final int FORMAT_SIZE = 4;

  // TODO: Figure out how to write with or w/o position information and read back in
  public static final String TVX_EXTENSION = ".tvx";
  public static final String TVD_EXTENSION = ".tvd";
  public static final String TVF_EXTENSION = ".tvf";

  private OutputStream tvx = null, tvd = null, tvf = null;

  /** Fields ({@link TVField}) already closed for the currently open document. */
  private Vector fields = null;

  /** Terms ({@link TVTerm}) buffered for the currently open field. */
  private Vector terms = null;

  private FieldInfos fieldInfos;

  /** The field currently being populated, or <code>null</code> when no field is open. */
  private TVField currentField = null;

  /** Position in the tvd file where the open document starts, or -1 when no document is open. */
  private long currentDocPointer = -1;

  /**
   * Create a term vectors writer for the specified segment in the specified
   * directory. A new TermVectorsWriter should be created for each segment.
   *
   * <p>The three output files are created immediately and
   * {@link #FORMAT_VERSION} is written at the start of each one. If any of
   * these steps fails, the streams opened so far are closed before the
   * exception propagates, because the caller never receives an instance on
   * which it could call {@link #close()}.
   *
   * @param directory  directory in which the files are created
   * @param segment    segment name, used as the base name of the files
   * @param fieldInfos used to translate field names into field numbers; its
   *                   size is used as the initial capacity of the field list
   * @throws IOException if a file cannot be created or its header cannot be written
   */
  public TermVectorsWriter(Directory directory, String segment, FieldInfos fieldInfos)
      throws IOException {
    boolean success = false;
    try {
      // Open files for TermVector storage
      tvx = directory.createFile(segment + TVX_EXTENSION);
      tvx.writeInt(FORMAT_VERSION);
      tvd = directory.createFile(segment + TVD_EXTENSION);
      tvd.writeInt(FORMAT_VERSION);
      tvf = directory.createFile(segment + TVF_EXTENSION);
      tvf.writeInt(FORMAT_VERSION);

      this.fieldInfos = fieldInfos;
      fields = new Vector(fieldInfos.size());
      terms = new Vector();
      success = true;
    } finally {
      if (!success) {
        // Construction failed part-way: release whatever was already opened.
        closeQuietly(tvx);
        closeQuietly(tvd);
        closeQuietly(tvf);
      }
    }
  }

  /**
   * Start a new document. If a document is still open it is closed (and
   * written) first.
   *
   * @throws IOException if closing the previous document fails
   */
  public final void openDocument() throws IOException {
    closeDocument();
    currentDocPointer = tvd.getFilePointer();
  }

  /**
   * Finish the current document: closes any open field, writes the document
   * record and resets the per-document state. Does nothing when no document
   * is open.
   *
   * @throws IOException if writing fails
   */
  public final void closeDocument() throws IOException {
    if (isDocumentOpen()) {
      closeField();
      writeDoc();
      fields.clear();
      currentDocPointer = -1;
    }
  }

  /** Return true if a document is currently open. */
  public final boolean isDocumentOpen() {
    return currentDocPointer != -1;
  }

  /**
   * Start processing a field. This can be followed by a number of calls to
   * addTerm, and a final call to closeField to indicate the end of
   * processing of this field. If a field was previously open, it is
   * closed automatically.
   *
   * @param field name of the field; it is translated to a field number
   *              through the FieldInfos given to the constructor
   * @throws IllegalStateException if no document is open
   * @throws IOException if closing the previously open field fails
   */
  public final void openField(String field) throws IOException {
    if (!isDocumentOpen()) {
      throw new IllegalStateException("Cannot open field when no document is open.");
    }

    closeField();
    currentField = new TVField(fieldInfos.fieldNumber(field));
  }

  /**
   * Finish processing the current field: its buffered terms are written and
   * the field is recorded for the document record. This should be followed
   * by a call to openField before future calls to addTerm. Does nothing when
   * no field is open.
   *
   * @throws IOException if writing the field fails
   */
  public final void closeField() throws IOException {
    if (isFieldOpen()) {
      // save field and terms
      writeField();
      fields.add(currentField);
      terms.clear();
      currentField = null;
    }
  }

  /** Return true if a field is currently open. */
  public final boolean isFieldOpen() {
    return currentField != null;
  }

  /**
   * Add a term to the open field's term vector. Terms should be added in
   * increasing order of terms, one call per unique term.
   *
   * @param termText text of the term
   * @param freq     number of times this term appears in this field, in this
   *                 document
   * @throws IllegalStateException if no document or no field is open
   */
  public final void addTerm(String termText, int freq) {
    if (!isDocumentOpen()) {
      throw new IllegalStateException("Cannot add terms when document is not open");
    }
    if (!isFieldOpen()) {
      throw new IllegalStateException("Cannot add terms when field is not open");
    }

    addTermInternal(termText, freq);
  }

  /** Buffer one term for the open field and add its frequency to the field's running total. */
  private final void addTermInternal(String termText, int freq) {
    currentField.length += freq;
    TVTerm term = new TVTerm();
    term.termText = termText;
    term.freq = freq;
    terms.add(term);
  }

  /**
   * Add the specified vectors to the document, one field per vector.
   *
   * @param vectors vectors to add, in the order given
   * @throws IllegalStateException if no document is open or a field is open
   * @throws IOException if writing fails
   */
  public final void addVectors(TermFreqVector[] vectors) throws IOException {
    if (!isDocumentOpen()) {
      throw new IllegalStateException("Cannot add term vectors when document is not open");
    }
    if (isFieldOpen()) {
      throw new IllegalStateException("Cannot add term vectors when field is open");
    }

    for (int i = 0; i < vectors.length; i++) {
      addTermFreqVector(vectors[i]);
    }
  }

  /**
   * Add the specified vector to the document. Document must be open but no
   * field should be open or an exception is thrown. The same document can
   * have <code>addTerm</code> and <code>addVectors</code> calls mixed,
   * however a given field must either be populated with <code>addTerm</code>
   * or with <code>addTermFreqVector</code>.
   *
   * @param vector vector whose field, terms and frequencies are written
   * @throws IllegalStateException if no document is open or a field is open
   * @throws IOException if writing fails
   */
  public final void addTermFreqVector(TermFreqVector vector) throws IOException {
    if (!isDocumentOpen()) {
      throw new IllegalStateException("Cannot add term vector when document is not open");
    }
    if (isFieldOpen()) {
      throw new IllegalStateException("Cannot add term vector when field is open");
    }

    addTermFreqVectorInternal(vector);
  }

  /** Open the vector's field, add each of its terms with its frequency, then close the field. */
  private final void addTermFreqVectorInternal(TermFreqVector vector) throws IOException {
    openField(vector.getField());

    // Fetch the arrays once rather than on every iteration.
    final int count = vector.size();
    final String[] termTexts = vector.getTerms();
    final int[] freqs = vector.getTermFrequencies();
    for (int i = 0; i < count; i++) {
      addTermInternal(termTexts[i], freqs[i]);
    }

    closeField();
  }

  /**
   * Close the open document (if any) and then all streams.
   *
   * @throws IOException if writing the last document fails or a stream cannot
   *                     be closed
   */
  final void close() throws IOException {
    try {
      closeDocument();
    } finally {
      // make an effort to close all streams we can but remember and re-throw
      // the first exception encountered in this process
      IOException keep = null;
      if (tvx != null) {
        try {
          tvx.close();
        } catch (IOException e) {
          if (keep == null) keep = e;
        }
      }
      if (tvd != null) {
        try {
          tvd.close();
        } catch (IOException e) {
          if (keep == null) keep = e;
        }
      }
      if (tvf != null) {
        try {
          tvf.close();
        } catch (IOException e) {
          if (keep == null) keep = e;
        }
      }
      // Rethrow as-is so the stack trace still points at the close that failed.
      if (keep != null) throw keep;
    }
  }

  /** Close a stream during failed construction, ignoring errors; a null stream is skipped. */
  private static void closeQuietly(OutputStream stream) {
    if (stream != null) {
      try {
        stream.close();
      } catch (IOException ignored) {
        // Best effort only: the failure that aborted construction is the one to report.
      }
    }
  }

  /**
   * Write the buffered terms of the current field to the tvf file and record
   * where the field starts.
   *
   * <p>Layout: term count, then (sum of term frequencies - term count), then
   * for every term the length of the prefix shared with the previous term,
   * the length of the remaining suffix, the suffix characters and the term
   * frequency.
   */
  private void writeField() throws IOException {
    // remember where this field is written
    currentField.tvfPointer = tvf.getFilePointer();

    final int size = terms.size();
    tvf.writeVInt(size);
    tvf.writeVInt(currentField.length - size);

    String lastTermText = "";
    // write term texts (prefix-compressed against the previous term) and frequencies
    for (int i = 0; i < size; i++) {
      TVTerm term = (TVTerm) terms.elementAt(i);
      int start = StringHelper.stringDifference(lastTermText, term.termText);
      int length = term.termText.length() - start;
      tvf.writeVInt(start);       // write shared prefix length
      tvf.writeVInt(length);      // write delta length
      tvf.writeChars(term.termText, start, length);  // write delta chars
      tvf.writeVInt(term.freq);
      lastTermText = term.termText;
    }
  }

  /**
   * Write the record of the current document.
   *
   * <p>The tvx file receives the position of the document in the tvd file.
   * The tvd file receives the number of fields, then the field numbers and
   * then the tvf pointers of those fields, each sequence encoded as the
   * difference to the previous value (starting from 0).
   *
   * @throws IllegalStateException if a field is still open
   */
  private void writeDoc() throws IOException {
    if (isFieldOpen()) {
      throw new IllegalStateException("Field is still open while writing document");
    }

    // write document index record
    tvx.writeLong(currentDocPointer);

    // write document data record
    final int size = fields.size();

    // write the number of fields
    tvd.writeVInt(size);

    // write field numbers
    int lastFieldNumber = 0;
    for (int i = 0; i < size; i++) {
      TVField field = (TVField) fields.elementAt(i);
      tvd.writeVInt(field.number - lastFieldNumber);
      lastFieldNumber = field.number;
    }

    // write field pointers
    long lastFieldPointer = 0;
    for (int i = 0; i < size; i++) {
      TVField field = (TVField) fields.elementAt(i);
      tvd.writeVLong(field.tvfPointer - lastFieldPointer);
      lastFieldPointer = field.tvfPointer;
    }
  }

  /** Bookkeeping for one field of the open document. */
  private static class TVField {
    /** Field number as returned by FieldInfos for the field name. */
    int number;
    /** Position in the tvf file where this field's terms were written. */
    long tvfPointer = 0;
    /** Running sum of the frequencies of the terms added to this field. */
    int length = 0;

    TVField(int number) {
      this.number = number;
    }
  }

  /** One buffered term of the open field. */
  private static class TVTerm {
    String termText;
    int freq = 0;
  }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -