📄 onepassdataindexer.java~8~
字号:
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2001 Jason Baldridge and Gann Bierner
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////
package opennlp.maxent;
import gnu.trove.*;
import java.util.*;
/**
 * An indexer for maxent model data which handles cutoffs for uncommon
 * contextual predicates and provides a unique integer index for each of the
 * predicates. In this variant every predicate also carries a numeric weight,
 * and each distinct weight value receives its own integer index. The data
 * structures built in the constructor of this class are used by the GIS
 * trainer.
 *
 * @author Jason Baldridge
 * @version $Revision: 1.1 $, $Date: 2003/12/13 16:41:29 $
 */
public class OnePassDataIndexer extends AbstractDataIndexer {

    /**
     * One argument constructor for DataIndexer which calls the two argument
     * constructor assuming no cutoff.
     *
     * @param eventStream An EventStream which supplies all the Events seen
     *                    in the training data.
     */
    public OnePassDataIndexer(EventStream eventStream) {
        this(eventStream, 0);
    }

    /**
     * Two argument constructor for DataIndexer. Reads every event from the
     * stream, indexes predicates, weights and outcomes, then hands the
     * indexed events to <tt>sortAndMerge</tt>. Progress is reported on
     * standard output.
     *
     * @param eventStream An EventStream which supplies all the Events seen
     *                    in the training data.
     * @param cutoff The minimum number of times a predicate must have been
     *               observed in order to be included in the model.
     */
    public OnePassDataIndexer(EventStream eventStream, int cutoff) {
        TObjectIntHashMap predicateIndex = new TObjectIntHashMap();
        TDoubleIntHashMap predicateWeightIndex = new TDoubleIntHashMap();

        System.out.println("\nIndexing events using cutoff of " + cutoff + "\n");

        System.out.print("\tComputing event counts... ");
        TLinkedList events = computeEventCounts(eventStream, predicateIndex,
                                                predicateWeightIndex, cutoff);
        System.out.println("done. " + events.size() + " events");

        System.out.print("\tIndexing... ");
        List eventsToCompare = index(events, predicateIndex, predicateWeightIndex);
        // The raw event list and both index maps are no longer needed; drop
        // the references before the sort/merge phase.
        events = null;
        predicateIndex = null;
        predicateWeightIndex = null;
        System.out.println("done.");

        System.out.print("Sorting and merging events... ");
        sortAndMerge(eventsToCompare);
        System.out.println("Done indexing.");
    }

    /**
     * Reads events from <tt>eventStream</tt> into a linked list. The
     * predicates associated with each event are counted and any which
     * occur at least <tt>cutoff</tt> times are added to the
     * <tt>predicatesInOut</tt> map along with a unique integer index.
     * Every distinct predicate weight is added to
     * <tt>predicatesWeightInOut</tt> with its own unique integer index;
     * weights are not subject to the cutoff.
     *
     * @param eventStream an <code>EventStream</code> value
     * @param predicatesInOut a <code>TObjectIntHashMap</code> value, filled
     *                        with predicate word -> index
     * @param predicatesWeightInOut a <code>TDoubleIntHashMap</code> value,
     *                              filled with weight -> index
     * @param cutoff an <code>int</code> value
     * @return a <code>TLinkedList</code> value holding every event read, in
     *         stream order
     */
    private TLinkedList computeEventCounts(EventStream eventStream,
                                           TObjectIntHashMap predicatesInOut,
                                           TDoubleIntHashMap predicatesWeightInOut,
                                           int cutoff) {
        // Occurrence counts for predicates that have not yet reached the cutoff.
        TObjectIntHashMap counter = new TObjectIntHashMap();
        TLinkedList events = new TLinkedList();
        int predicateIndex = 0;
        int predicateWeightIndex = 0;

        while (eventStream.hasNext()) {
            Event ev = eventStream.nextEvent();
            events.addLast(ev);

            Predicate[] preds = ev.getMailContext();
            for (int j = 0; j < preds.length; j++) {
                String word = preds[j].word;
                double weight = preds[j].weight;

                // Predicate words: only indexed once seen at least cutoff times.
                if (!predicatesInOut.containsKey(word)) {
                    // increment() reports false when the key is absent, in
                    // which case the count is started at 1.
                    if (!counter.increment(word)) {
                        counter.put(word, 1);
                    }
                    if (counter.get(word) >= cutoff) {
                        predicatesInOut.put(word, predicateIndex++);
                        // Once indexed the running count is no longer needed.
                        counter.remove(word);
                    }
                }

                // Weights: indexed on first sight, no counting required.
                if (!predicatesWeightInOut.containsKey(weight)) {
                    predicatesWeightInOut.put(weight, predicateWeightIndex++);
                }
            }
        }

        predicatesInOut.trimToSize();
        predicatesWeightInOut.trimToSize();
        return events;
    }

    /**
     * Converts the raw events into <code>ComparableEvent</code>s whose
     * outcome, predicates and weights are expressed as integer indices, and
     * stores the outcome, predicate and weight label tables in
     * <tt>outcomeLabels</tt>, <tt>predLabels</tt> and <tt>weightLabels</tt>.
     * The <tt>events</tt> list is consumed (emptied) in the process.
     *
     * <p>Predicates that are not present in <tt>predicateIndex</tt> (i.e.
     * those that did not reach the cutoff) are skipped together with their
     * weight. An event left with no indexed predicate is dropped and
     * reported on standard error.
     *
     * @param events the events read from the stream
     * @param predicateIndex predicate word -> index map
     * @param predicateWeightIndex weight -> index map
     * @return a <code>List</code> of <code>ComparableEvent</code>s
     */
    private List index(TLinkedList events,
                       TObjectIntHashMap predicateIndex,
                       TDoubleIntHashMap predicateWeightIndex) {
        TObjectIntHashMap omap = new TObjectIntHashMap();
        TIntArrayList indexedContext = new TIntArrayList();
        TIntArrayList indexedWeight = new TIntArrayList();

        int numEvents = events.size();
        List eventsToCompare = new ArrayList(numEvents);
        int outcomeCount = 0;

        for (int eventIndex = 0; eventIndex < numEvents; eventIndex++) {
            Event ev = (Event) events.removeFirst();
            Predicate[] preds = ev.getMailContext();

            // Outcome: reuse the existing id or assign the next free one.
            String oc = ev.getOutcome();
            int ocID;
            if (omap.containsKey(oc)) {
                ocID = omap.get(oc);
            }
            else {
                ocID = outcomeCount++;
                omap.put(oc, ocID);
            }

            // Context: keep only predicates that made it into the index.
            // Without this guard, get() on an absent key yields Trove's
            // no-entry value (0), silently aliasing every below-cutoff
            // predicate to predicate index 0.
            for (int i = 0; i < preds.length; i++) {
                String pred = preds[i].word;
                if (predicateIndex.containsKey(pred)) {
                    indexedContext.add(predicateIndex.get(pred));
                    // Added in lock-step so both arrays stay parallel.
                    indexedWeight.add(predicateWeightIndex.get(preds[i].weight));
                }
            }

            // drop events with no active features
            if (indexedContext.size() > 0) {
                ComparableEvent ce =
                    new ComparableEvent(ocID, indexedContext.toNativeArray(),
                                        indexedWeight.toNativeArray());
                eventsToCompare.add(ce);
            }
            else {
                System.err.println("Dropped event " + ev.getOutcome() + ":" +
                                   Arrays.asList(ev.getMailContext()));
            }

            // recycle the TIntArrayLists for the next event
            indexedContext.resetQuick();
            indexedWeight.resetQuick();
        }

        outcomeLabels = toIndexedStringArray(omap);
        predLabels = toIndexedStringArray(predicateIndex);
        weightLabels = toIndexedStringArray(predicateWeightIndex);
        return eventsToCompare;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -