📄 crf4.java
字号:
private boolean[][] labelConnectionsIn (InstanceList trainingSet) { return labelConnectionsIn (trainingSet, null); } private boolean[][] labelConnectionsIn (InstanceList trainingSet, String start) { int numLabels = outputAlphabet.size(); boolean[][] connections = new boolean[numLabels][numLabels]; for (int i = 0; i < trainingSet.size(); i++) { Instance instance = trainingSet.getInstance(i); FeatureSequence output = (FeatureSequence) instance.getTarget(); for (int j = 1; j < output.size(); j++) { int sourceIndex = outputAlphabet.lookupIndex (output.get(j-1)); int destIndex = outputAlphabet.lookupIndex (output.get(j)); assert (sourceIndex >= 0 && destIndex >= 0); connections[sourceIndex][destIndex] = true; } } // Handle start state if (start != null) { int startIndex = outputAlphabet.lookupIndex (start); for (int j = 0; j < outputAlphabet.size(); j++) { connections[startIndex][j] = true; } } return connections; } /** Add states to create a first-order Markov model on labels, adding only those transitions the occur in the given trainingSet. */ public void addStatesForLabelsConnectedAsIn (InstanceList trainingSet) { int numLabels = outputAlphabet.size(); boolean[][] connections = labelConnectionsIn (trainingSet); for (int i = 0; i < numLabels; i++) { int numDestinations = 0; for (int j = 0; j < numLabels; j++) if (connections[i][j]) numDestinations++; String[] destinationNames = new String[numDestinations]; int destinationIndex = 0; for (int j = 0; j < numLabels; j++) if (connections[i][j]) destinationNames[destinationIndex++] = (String)outputAlphabet.lookupObject(j); addState ((String)outputAlphabet.lookupObject(i), destinationNames); } } /** Add as many states as there are labels, but don't create separate weights for each source-destination pair of states. Instead have all the incoming transitions to a state share the same weights. 
@param trainingSet instances whose label sequences determine which
       transitions are created
*/
public void addStatesForHalfLabelsConnectedAsIn (InstanceList trainingSet) {
  final int labelCount = outputAlphabet.size();
  final boolean[][] reachable = labelConnectionsIn (trainingSet);

  for (int source = 0; source < labelCount; source++) {
    final boolean[] row = reachable[source];

    // Count the outgoing transitions of this label so the name array
    // can be allocated at its exact size.
    int fanOut = 0;
    for (int dest = 0; dest < labelCount; dest++) {
      if (row[dest])
        fanOut++;
    }

    // Collect the names of the reachable labels, in alphabet-index order.
    String[] targets = new String[fanOut];
    int next = 0;
    for (int dest = 0; dest < labelCount; dest++) {
      if (!row[dest])
        continue;
      targets[next] = (String) outputAlphabet.lookupObject(dest);
      next++;
    }

    // The same array is passed for all three name arguments, so each
    // transition's weight name is just its destination's name; presumably
    // that is what makes all incoming transitions share weights (confirm
    // against addState).
    String sourceName = (String) outputAlphabet.lookupObject(source);
    addState (sourceName, 0.0, 0.0, targets, targets, targets);
  }
}

/** Add as many states as there are labels, but don't create separate
    observational-test-weights for each source-destination pair of
    states---instead have all the incoming transitions to a state share
    the same observational-feature-test weights.  However, do create
    separate default feature for each transition, (which acts as an
    HMM-style transition probability).
*/ public void addStatesForThreeQuarterLabelsConnectedAsIn (InstanceList trainingSet) { int numLabels = outputAlphabet.size(); boolean[][] connections = labelConnectionsIn (trainingSet); for (int i = 0; i < numLabels; i++) { int numDestinations = 0; for (int j = 0; j < numLabels; j++) if (connections[i][j]) numDestinations++; String[] destinationNames = new String[numDestinations]; String[][] weightNames = new String[numDestinations][]; int destinationIndex = 0; for (int j = 0; j < numLabels; j++) if (connections[i][j]) { String labelName = (String)outputAlphabet.lookupObject(j); destinationNames[destinationIndex] = labelName; weightNames[destinationIndex] = new String[2]; // The "half-labels" will include all observational tests weightNames[destinationIndex][0] = labelName; // The "transition" weights will include only the default feature String wn = (String)outputAlphabet.lookupObject(i) + "->" + (String)outputAlphabet.lookupObject(j); weightNames[destinationIndex][1] = wn; int wi = getWeightsIndex (wn); // A new empty FeatureSelection won't allow any features here, so we only // get the default feature for transitions featureSelections[wi] = new FeatureSelection(trainingSet.getDataAlphabet()); destinationIndex++; } addState ((String)outputAlphabet.lookupObject(i), 0.0, 0.0, destinationNames, destinationNames, weightNames); } } public void addFullyConnectedStatesForThreeQuarterLabels (InstanceList trainingSet) { int numLabels = outputAlphabet.size(); for (int i = 0; i < numLabels; i++) { String[] destinationNames = new String[numLabels]; String[][] weightNames = new String[numLabels][]; for (int j = 0; j < numLabels; j++) { String labelName = (String)outputAlphabet.lookupObject(j); destinationNames[j] = labelName; weightNames[j] = new String[2]; // The "half-labels" will include all observational tests weightNames[j][0] = labelName; // The "transition" weights will include only the default feature String wn = (String)outputAlphabet.lookupObject(i) + "->" + 
(String)outputAlphabet.lookupObject(j); weightNames[j][1] = wn; int wi = getWeightsIndex (wn); // A new empty FeatureSelection won't allow any features here, so we only // get the default feature for transitions featureSelections[wi] = new FeatureSelection(trainingSet.getDataAlphabet()); } addState ((String)outputAlphabet.lookupObject(i), 0.0, 0.0, destinationNames, destinationNames, weightNames); } } public void addFullyConnectedStatesForBiLabels () { String[] labels = new String[outputAlphabet.size()]; // This is assuming the the entries in the outputAlphabet are Strings! for (int i = 0; i < outputAlphabet.size(); i++) { logger.info ("CRF: outputAlphabet.lookup class = "+ outputAlphabet.lookupObject(i).getClass().getName()); labels[i] = (String) outputAlphabet.lookupObject(i); } for (int i = 0; i < labels.length; i++) { for (int j = 0; j < labels.length; j++) { String[] destinationNames = new String[labels.length]; for (int k = 0; k < labels.length; k++) destinationNames[k] = labels[j]+LABEL_SEPARATOR+labels[k]; addState (labels[i]+LABEL_SEPARATOR+labels[j], 0.0, 0.0, destinationNames, labels); } } } /** Add states to create a second-order Markov model on labels, adding only those transitions the occur in the given trainingSet. 
@param trainingSet instances whose label sequences determine which
       states and transitions are created
*/
public void addStatesForBiLabelsConnectedAsIn (InstanceList trainingSet) {
  int numLabels = outputAlphabet.size();
  boolean[][] connections = labelConnectionsIn (trainingSet);
  for (int i = 0; i < numLabels; i++) {
    String firstName = (String) outputAlphabet.lookupObject(i);
    for (int j = 0; j < numLabels; j++) {
      // Only create the state (i, j) when label j was seen following label i.
      if (!connections[i][j])
        continue;
      String secondName = (String) outputAlphabet.lookupObject(j);

      // First pass counts the successors of j so the arrays can be sized exactly.
      int numDestinations = 0;
      for (int k = 0; k < numLabels; k++)
        if (connections[j][k])
          numDestinations++;

      // Second pass: state (i, j) leads to every state (j, k) with j->k
      // observed, and that transition is labelled with k.
      String[] destinationNames = new String[numDestinations];
      String[] labels = new String[numDestinations];
      int destinationIndex = 0;
      for (int k = 0; k < numLabels; k++) {
        if (!connections[j][k])
          continue;
        String thirdName = (String) outputAlphabet.lookupObject(k);
        destinationNames[destinationIndex] = secondName+LABEL_SEPARATOR+thirdName;
        labels[destinationIndex] = thirdName;
        destinationIndex++;
      }
      addState (firstName+LABEL_SEPARATOR+secondName, 0.0, 0.0,
                destinationNames, labels);
    }
  }
}

/**
 * Adds one state per ordered triple of labels, named with the three labels
 * joined by <code>LABEL_SEPARATOR</code>.  Each state (a, b, c) gets a
 * transition to every state (b, c, d), labelled with d.
 */
public void addFullyConnectedStatesForTriLabels () {
  String[] labels = new String[outputAlphabet.size()];
  // This is assuming that the entries in the outputAlphabet are Strings!
  for (int i = 0; i < outputAlphabet.size(); i++) {
    logger.info ("CRF: outputAlphabet.lookup class = "+
                 outputAlphabet.lookupObject(i).getClass().getName());
    labels[i] = (String) outputAlphabet.lookupObject(i);
  }
  for (int i = 0; i < labels.length; i++) {
    for (int j = 0; j < labels.length; j++) {
      for (int k = 0; k < labels.length; k++) {
        // A fresh array per state: addState may keep a reference to it.
        String[] destinationNames = new String[labels.length];
        for (int l = 0; l < labels.length; l++)
          destinationNames[l] = labels[j]+LABEL_SEPARATOR+labels[k]+LABEL_SEPARATOR+labels[l];
        addState (labels[i]+LABEL_SEPARATOR+labels[j]+LABEL_SEPARATOR+labels[k], 0.0, 0.0,
                  destinationNames, labels);
      }
    }
  }
}

/**
 * Adds a single state that has one transition back to itself for every label
 * in the output alphabet.
 *
 * @param name the name of the state to add
 */
public void addSelfTransitioningStateForAllLabels (String name) {
  String[] labels = new String[outputAlphabet.size()];
  String[] destinationNames = new String[outputAlphabet.size()];
  // This is assuming that the entries in the outputAlphabet are Strings!
  for (int i = 0; i < outputAlphabet.size(); i++) {
    logger.info ("CRF: outputAlphabet.lookup class = "+
                 outputAlphabet.lookupObject(i).getClass().getName());
    labels[i] = (String) outputAlphabet.lookupObject(i);
    // Every transition leads back to this same state.
    destinationNames[i] = name;
  }
  addState (name, 0.0, 0.0, destinationNames, labels);
}

/**
 * Joins <code>labels</code> with <code>LABEL_SEPARATOR</code> between
 * consecutive entries.
 *
 * @param labels the labels to join, in order
 * @return the joined string; empty when <code>labels</code> is empty
 */
private String concatLabels(String[] labels) {
  String sep = "";
  StringBuffer buf = new StringBuffer();
  for (int i = 0; i < labels.length; i++) {
    buf.append(sep).append(labels[i]);
    // After the first element, every further element is preceded by the separator.
    sep = LABEL_SEPARATOR;
  }
  return buf.toString();
}

/**
 * Joins the last <code>k - 1</code> entries of <code>history</code> and then
 * <code>next</code>, separated by <code>LABEL_SEPARATOR</code>.
 *
 * @param history the current label history, oldest first
 * @param k the number of labels in the result; when <code>k</code> is less
 *        than 2 the result is just <code>next</code>.  A value greater than
 *        <code>history.length + 1</code> indexes before the start of
 *        <code>history</code>, so callers are assumed never to pass one.
 * @param next the label appended after the history suffix
 * @return the joined k-gram
 */
private String nextKGram(String[] history, int k, String next) {
  String sep = "";
  StringBuffer buf = new StringBuffer();
  // Index of the first history entry that belongs to the k-gram.
  int start = history.length + 1 - k;
  for (int i = start; i < history.length; i++) {
    buf.append(sep).append(history[i]);
    sep = LABEL_SEPARATOR;
  }
  buf.append(sep).append(next);
  return buf.toString();
}

/**
 * Tests whether label <code>curr</code> may directly follow label
 * <code>prev</code>.  The two labels are joined with
 * <code>LABEL_SEPARATOR</code> and the whole string is matched against the
 * patterns.
 *
 * @param prev the earlier label
 * @param curr the later label
 * @param no if non-null, a pair matching this pattern is rejected
 * @param yes if non-null, a pair not matching this pattern is rejected
 * @return true when the pair passes both checks
 */
private boolean allowedTransition(String prev, String curr, Pattern no, Pattern yes) {
  String pair = concatLabels(new String[]{prev, curr});
  if (no != null && no.matcher(pair).matches())
    return false;
  if (yes != null && !yes.matcher(pair).matches())
    return false;
  return true;
}

/**
 * Tests whether every pair of adjacent labels in <code>history</code> is an
 * allowed transition.
 *
 * @param history the label sequence to check, oldest first
 * @param no passed through to <code>allowedTransition</code>
 * @param yes passed through to <code>allowedTransition</code>
 * @return true when all adjacent pairs are allowed; also true when
 *         <code>history</code> has fewer than two entries
 */
private boolean allowedHistory(String[] history, Pattern no, Pattern yes) {
  for (int i = 1; i < history.length; i++)
    if (!allowedTransition(history[i-1], history[i], no, yes))
      return false;
  return true;
}

/**
 * Assumes that the CRF's output alphabet contains
 * <code>String</code>s. Creates an order-<em>n</em> CRF with input
 * predicates and output labels given by <code>trainingSet</code>
 * and order, connectivity, and weights given by the remaining
 * arguments.
 *
 * @param trainingSet the training instances
 * @param orders an array of increasing non-negative numbers giving
 * the orders of the features for this CRF. The largest number
 * <em>n</em> is the Markov order of the CRF. States are
 * <em>n</em>-tuples of output labels. Each of the other numbers
 * <em>k</em> in <code>orders</code> represents a weight set shared
 * by all destination states whose last (most recent) <em>k</em>
 * labels agree. If <code>orders</code> is <code>null</code>, an
 * order-0 CRF is built.
 * @param defaults If non-null, it must be the same length as
 * <code>orders</code>, with <code>true</code> positions indicating
 * that the weight set for the corresponding order contains only the
 * weight for a default feature; otherwise, the weight set has
 * weights for all features built from input predicates.
 * @param start The label that represents the context of the start of
 * a sequence. It may be also used for sequence labels. If no label of
 * this name exists, one will be added. Connections will be added between
 * the start label and all other labels, even if <tt>fullyConnected</tt> is
 * <tt>false</tt>. This argument may be null, in which case no special
 * start state is added.
 * @param forbidden If non-null, specifies what pairs of successive
 * labels are not allowed, both for constructing <em>n</em>order
 * states or for transitions. A label pair (<em>u</em>,<em>v</em>)
 * is not allowed if <em>u</em> + "," + <em>v</em> matches
 * <code>forbidden</code>.
* @param allowed If non-null, specifies what pairs of successive * labels are allowed, both for constructing <em>n</em>order * states or for transitions. A label pair (<em>u</em>,<em>v</em>) * is allowed only if <em>u</em> + "," + <em>v</em> matches * <code>allowed</code>. * @param fullyConnected Whether to include all allowed transitions, * even those not occurring in <code>trainingSet</code>, * @return The name of the start state. * */ public String addOrderNStates(InstanceList trainingSet, int[] orders, boolean[] defaults, String start, Pattern forbidden, Pattern allowed, boolean fullyConnected) { boolean[][] connections = null; if (start != null) outputAlphabet.lookupIndex (start); if (!fullyConnected) connections = labelConnectionsIn (trainingSet, start); int order = -1; if (defaults != null && defaults.length != orders.length) throw new IllegalArgumentException("Defaults must be null or match orders"); if (orders == null) order = 0; else { for (int i = 0; i < orders.length; i++) if (orders[i] <= order) throw new IllegalArgumentException("Orders must be non-negative and in ascending order"); else order = orders[i]; if (order < 0) order = 0; } if (order > 0) { int[] historyIndexes = new int[order]; String[] history = new String[order]; String label0 = (String)outputAlphabet.lookupObject(0); for (int i = 0; i < order; i++) history[i] = label0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -