DocumentWriter.java
          ti.payloads = newPayloads;
        }
      }
      ti.positions[freq] = position;            // add new position

      if (payload != null) {
        if (ti.payloads == null) {
          // lazily allocate payload array
          ti.payloads = new Payload[ti.positions.length];
        }
        ti.payloads[freq] = payload;
      }

      if (offset != null) {
        if (ti.offsets.length == freq) {
          TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
          TermVectorOffsetInfo[] offsets = ti.offsets;
          System.arraycopy(offsets, 0, newOffsets, 0, freq);
          ti.offsets = newOffsets;
        }
        ti.offsets[freq] = offset;
      }
      ti.freq = freq + 1;                       // update frequency
    } else {                                    // word not seen before
      Term term = new Term(field, text, false);
      postingTable.put(term, new Posting(term, position, payload, offset));
    }
  }

  private final Posting[] sortPostingTable() {
    // copy postingTable into an array
    Posting[] array = new Posting[postingTable.size()];
    Enumeration postings = postingTable.elements();
    for (int i = 0; postings.hasMoreElements(); i++)
      array[i] = (Posting) postings.nextElement();

    // sort the array
    quickSort(array, 0, array.length - 1);

    return array;
  }

  private static final void quickSort(Posting[] postings, int lo, int hi) {
    if (lo >= hi)
      return;

    int mid = (lo + hi) / 2;

    if (postings[lo].term.compareTo(postings[mid].term) > 0) {
      Posting tmp = postings[lo];
      postings[lo] = postings[mid];
      postings[mid] = tmp;
    }

    if (postings[mid].term.compareTo(postings[hi].term) > 0) {
      Posting tmp = postings[mid];
      postings[mid] = postings[hi];
      postings[hi] = tmp;

      if (postings[lo].term.compareTo(postings[mid].term) > 0) {
        Posting tmp2 = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp2;
      }
    }

    int left = lo + 1;
    int right = hi - 1;

    if (left >= right)
      return;

    Term partition = postings[mid].term;

    for (; ;) {
      while (postings[right].term.compareTo(partition) > 0)
        --right;

      while (left < right && postings[left].term.compareTo(partition) <= 0)
        ++left;

      if (left < right) {
        Posting tmp = postings[left];
        postings[left] = postings[right];
        postings[right] = tmp;
        --right;
      } else {
        break;
      }
    }

    quickSort(postings, lo, left);
    quickSort(postings, left + 1, hi);
  }

  private final void writePostings(Posting[] postings, String segment)
          throws CorruptIndexException, IOException {
    IndexOutput freq = null, prox = null;
    TermInfosWriter tis = null;
    TermVectorsWriter termVectorWriter = null;
    try {
      // open files for inverse index storage
      freq = directory.createOutput(segment + ".frq");
      prox = directory.createOutput(segment + ".prx");
      tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
      TermInfo ti = new TermInfo();
      String currentField = null;
      boolean currentFieldHasPayloads = false;

      for (int i = 0; i < postings.length; i++) {
        Posting posting = postings[i];

        // check to see if we switched to a new field
        String termField = posting.term.field();
        if (currentField != termField) {
          // changing field - see if there is something to save
          currentField = termField;
          FieldInfo fi = fieldInfos.fieldInfo(currentField);
          currentFieldHasPayloads = fi.storePayloads;
          if (fi.storeTermVector) {
            if (termVectorWriter == null) {
              termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
              termVectorWriter.openDocument();
            }
            termVectorWriter.openField(currentField);
          } else if (termVectorWriter != null) {
            termVectorWriter.closeField();
          }
        }

        // add an entry to the dictionary with pointers to prox and freq files
        ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
        tis.add(posting.term, ti);

        // add an entry to the freq file
        int postingFreq = posting.freq;
        if (postingFreq == 1)                   // optimize freq=1
          freq.writeVInt(1);                    // set low bit of doc num.
        else {
          freq.writeVInt(0);                    // the document number
          freq.writeVInt(postingFreq);          // frequency in doc
        }

        int lastPosition = 0;                   // write positions
        int[] positions = posting.positions;
        Payload[] payloads = posting.payloads;
        int lastPayloadLength = -1;

        // The following encoding is being used for positions and payloads:
        //
        // Case 1: current field does not store payloads
        //   Positions     -> <PositionDelta>^freq
        //   PositionDelta -> VInt
        //
        //   The PositionDelta is the difference between the current
        //   and the previous position.
        //
        // Case 2: current field stores payloads
        //   Positions     -> <PositionDelta, Payload>^freq
        //   Payload       -> <PayloadLength?, PayloadData>
        //   PositionDelta -> VInt
        //   PayloadLength -> VInt
        //   PayloadData   -> byte^PayloadLength
        //
        //   In this case PositionDelta/2 is the difference between the
        //   current and the previous position. If PositionDelta is odd,
        //   then a PayloadLength encoded as VInt follows; if PositionDelta
        //   is even, then it is assumed that the length of the current
        //   Payload equals the length of the previous Payload.
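        // Illustrative example (hypothetical values, not from the original source):
        // with payloads enabled, positions 3, 7, 12 with payload lengths 2, 2, 4
        // are written to the prox stream as
        //   VInt(3*2+1)=7,  VInt(2), <2 payload bytes>   // length changed (-1 -> 2)
        //   VInt(4*2)=8,             <2 payload bytes>   // same length, even delta
        //   VInt(5*2+1)=11, VInt(4), <4 payload bytes>   // length changed (2 -> 4)
        // while the freq stream for this term holds VInt(0), VInt(3) since freq > 1.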
        for (int j = 0; j < postingFreq; j++) { // use delta-encoding
          int position = positions[j];
          int delta = position - lastPosition;
          if (currentFieldHasPayloads) {
            int payloadLength = 0;
            Payload payload = null;
            if (payloads != null) {
              payload = payloads[j];
              if (payload != null) {
                payloadLength = payload.length;
              }
            }
            if (payloadLength == lastPayloadLength) {
              // the length of the current payload equals the length
              // of the previous one. So we do not have to store the length
              // again and we only shift the position delta by one bit
              prox.writeVInt(delta * 2);
            } else {
              // the length of the current payload is different from the
              // previous one. We shift the position delta, set the lowest
              // bit and store the current payload length as VInt.
              prox.writeVInt(delta * 2 + 1);
              prox.writeVInt(payloadLength);
              lastPayloadLength = payloadLength;
            }
            if (payloadLength > 0) {
              // write current payload
              prox.writeBytes(payload.data, payload.offset, payload.length);
            }
          } else {
            // field does not store payloads, just write position delta as VInt
            prox.writeVInt(delta);
          }
          lastPosition = position;
        }

        if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
          termVectorWriter.addTerm(posting.term.text(), postingFreq,
              posting.positions, posting.offsets);
        }
      }
      if (termVectorWriter != null)
        termVectorWriter.closeDocument();
    } finally {
      // make an effort to close all streams we can but remember and re-throw
      // the first exception encountered in this process
      IOException keep = null;
      if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (tis != null) try { tis.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (termVectorWriter != null) try { termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (keep != null) throw (IOException) keep.fillInStackTrace();
    }
  }

  private final void writeNorms(String segment) throws IOException {
    for (int n = 0; n < fieldInfos.size(); n++) {
      FieldInfo fi = fieldInfos.fieldInfo(n);
      if (fi.isIndexed && !fi.omitNorms) {
        float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]);
        IndexOutput norms = directory.createOutput(segment + ".f" + n);
        try {
          norms.writeByte(Similarity.encodeNorm(norm));
        } finally {
          norms.close();
        }
      }
    }
  }
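  // Example (hypothetical numbers, assuming the default Similarity and no boosts):
  // a field containing 4 indexed terms gets lengthNorm = 1/sqrt(4) = 0.5, and
  // Similarity.encodeNorm packs that value into the single byte written above.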
  /** If non-null, a message will be printed to this if maxFieldLength is reached. */
  void setInfoStream(PrintStream infoStream) {
    this.infoStream = infoStream;
  }

  int getNumFields() {
    return fieldInfos.size();
  }
}

final class Posting {                       // info about a Term in a doc
  Term term;                                // the Term
  int freq;                                 // its frequency in doc
  int[] positions;                          // positions it occurs at
  Payload[] payloads;                       // the payloads of the terms
  TermVectorOffsetInfo[] offsets;

  Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) {
    term = t;
    freq = 1;
    positions = new int[1];
    positions[0] = position;

    if (payload != null) {
      payloads = new Payload[1];
      payloads[0] = payload;
    } else
      payloads = null;

    if (offset != null) {
      offsets = new TermVectorOffsetInfo[1];
      offsets[0] = offset;
    } else
      offsets = null;
  }
}
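For reference, below is a minimal sketch of how the .frq and .prx entries written above could be read back. It is not part of DocumentWriter; the class and method names are made up for illustration, and it assumes a single-document segment exactly as produced by writePostings. Lucene's actual readers (SegmentTermDocs / SegmentTermPositions) remain the authoritative implementation.

import java.io.IOException;
import org.apache.lucene.store.IndexInput;

// Hypothetical helper, not part of Lucene: decodes one term's entry as written
// by writePostings above, assuming a single-document segment.
final class PostingDecodeSketch {

  static void readOneTerm(IndexInput freq, IndexInput prox, boolean storePayloads)
      throws IOException {
    int docCode = freq.readVInt();
    // writePostings stores VInt(1) when freq == 1, otherwise VInt(0) followed by the frequency
    int docFreq = (docCode & 1) != 0 ? 1 : freq.readVInt();

    int position = 0;
    int payloadLength = -1;
    for (int i = 0; i < docFreq; i++) {
      int code = prox.readVInt();
      if (storePayloads) {
        position += code >>> 1;             // delta was shifted left by one bit
        if ((code & 1) != 0) {              // odd delta: a new payload length follows
          payloadLength = prox.readVInt();
        }
        if (payloadLength > 0) {            // payload bytes follow the position entry
          byte[] data = new byte[payloadLength];
          prox.readBytes(data, 0, payloadLength);
        }
      } else {
        position += code;                   // plain position delta
      }
      System.out.println("position " + position);
    }
  }
}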