⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crf4.java

📁 mallet是自然语言处理、机器学习领域的一个开源项目。
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
  private boolean[][] labelConnectionsIn (InstanceList trainingSet)  {    return labelConnectionsIn (trainingSet, null);  }	private boolean[][] labelConnectionsIn (InstanceList trainingSet, String start)	{		int numLabels = outputAlphabet.size();		boolean[][] connections = new boolean[numLabels][numLabels];		for (int i = 0; i < trainingSet.size(); i++) {			Instance instance = trainingSet.getInstance(i);			FeatureSequence output = (FeatureSequence) instance.getTarget();			for (int j = 1; j < output.size(); j++) {				int sourceIndex = outputAlphabet.lookupIndex (output.get(j-1));				int destIndex = outputAlphabet.lookupIndex (output.get(j));				assert (sourceIndex >= 0 && destIndex >= 0);				connections[sourceIndex][destIndex] = true;			}		}    // Handle start state    if (start != null) {      int startIndex = outputAlphabet.lookupIndex (start);      for (int j = 0; j < outputAlphabet.size(); j++) {        connections[startIndex][j] = true;      }    }		return connections;	}	/** Add states to create a first-order Markov model on labels,			adding only those transitions the occur in the given			trainingSet. */	public void addStatesForLabelsConnectedAsIn (InstanceList trainingSet)	{		int numLabels = outputAlphabet.size();		boolean[][] connections = labelConnectionsIn (trainingSet);		for (int i = 0; i < numLabels; i++) {			int numDestinations = 0;			for (int j = 0; j < numLabels; j++)				if (connections[i][j]) numDestinations++;			String[] destinationNames = new String[numDestinations];			int destinationIndex = 0;			for (int j = 0; j < numLabels; j++)				if (connections[i][j])					destinationNames[destinationIndex++] = (String)outputAlphabet.lookupObject(j);			addState ((String)outputAlphabet.lookupObject(i), destinationNames);		}	}	/** Add as many states as there are labels, but don't create separate weights			for each source-destination pair of states.  Instead have all the incoming			transitions to a state share the same weights. 
*/
	public void addStatesForHalfLabelsConnectedAsIn (InstanceList trainingSet)
	{
		int labelCount = outputAlphabet.size();
		boolean[][] linked = labelConnectionsIn (trainingSet);
		for (int src = 0; src < labelCount; src++) {
			// Gather the destination names in a scratch array, then trim it
			// to the number of destinations actually found.
			String[] scratch = new String[labelCount];
			int found = 0;
			for (int dst = 0; dst < labelCount; dst++) {
				if (linked[src][dst])
					scratch[found++] = (String)outputAlphabet.lookupObject(dst);
			}
			String[] destinationNames = new String[found];
			System.arraycopy (scratch, 0, destinationNames, 0, found);
			// The same array is passed for all three array arguments of addState.
			addState ((String)outputAlphabet.lookupObject(src), 0.0, 0.0,
								destinationNames, destinationNames, destinationNames);
		}
	}

	/** Add as many states as there are labels, but don't create
			separate observational-test-weights for each source-destination
			pair of states---instead have all the incoming transitions to a
			state share the same observational-feature-test weights.
			However, do create separate default feature for each transition,
			(which acts as an HMM-style transition probability).
*/	public void addStatesForThreeQuarterLabelsConnectedAsIn (InstanceList trainingSet)	{		int numLabels = outputAlphabet.size();		boolean[][] connections = labelConnectionsIn (trainingSet);		for (int i = 0; i < numLabels; i++) {			int numDestinations = 0;			for (int j = 0; j < numLabels; j++)				if (connections[i][j]) numDestinations++;			String[] destinationNames = new String[numDestinations];			String[][] weightNames = new String[numDestinations][];			int destinationIndex = 0;			for (int j = 0; j < numLabels; j++)				if (connections[i][j]) {					String labelName = (String)outputAlphabet.lookupObject(j);					destinationNames[destinationIndex] = labelName;					weightNames[destinationIndex] = new String[2];					// The "half-labels" will include all observational tests					weightNames[destinationIndex][0] = labelName;					// The "transition" weights will include only the default feature					String wn = (String)outputAlphabet.lookupObject(i) + "->" + (String)outputAlphabet.lookupObject(j);					weightNames[destinationIndex][1] = wn;					int wi = getWeightsIndex (wn);					// A new empty FeatureSelection won't allow any features here, so we only					// get the default feature for transitions					featureSelections[wi] = new FeatureSelection(trainingSet.getDataAlphabet());					destinationIndex++;				}			addState ((String)outputAlphabet.lookupObject(i), 0.0, 0.0,								destinationNames, destinationNames, weightNames);		}	}	public void addFullyConnectedStatesForThreeQuarterLabels (InstanceList trainingSet)	{		int numLabels = outputAlphabet.size();		for (int i = 0; i < numLabels; i++) {			String[] destinationNames = new String[numLabels];			String[][] weightNames = new String[numLabels][];			for (int j = 0; j < numLabels; j++) {				String labelName = (String)outputAlphabet.lookupObject(j);				destinationNames[j] = labelName;				weightNames[j] = new String[2];				// The "half-labels" will include all observational tests				weightNames[j][0] = labelName;				// The "transition" 
weights will include only the default feature				String wn = (String)outputAlphabet.lookupObject(i) + "->" + (String)outputAlphabet.lookupObject(j);				weightNames[j][1] = wn;				int wi = getWeightsIndex (wn);				// A new empty FeatureSelection won't allow any features here, so we only				// get the default feature for transitions				featureSelections[wi] = new FeatureSelection(trainingSet.getDataAlphabet());			}			addState ((String)outputAlphabet.lookupObject(i), 0.0, 0.0,								destinationNames, destinationNames, weightNames);		}	}		public void addFullyConnectedStatesForBiLabels ()	{		String[] labels = new String[outputAlphabet.size()];		// This is assuming the the entries in the outputAlphabet are Strings!		for (int i = 0; i < outputAlphabet.size(); i++) {			logger.info ("CRF: outputAlphabet.lookup class = "+									 outputAlphabet.lookupObject(i).getClass().getName());			labels[i] = (String) outputAlphabet.lookupObject(i);		}		for (int i = 0; i < labels.length; i++) {			for (int j = 0; j < labels.length; j++) {				String[] destinationNames = new String[labels.length];				for (int k = 0; k < labels.length; k++)					destinationNames[k] = labels[j]+LABEL_SEPARATOR+labels[k];				addState (labels[i]+LABEL_SEPARATOR+labels[j], 0.0, 0.0,									destinationNames, labels);			}		}	}	/** Add states to create a second-order Markov model on labels,			adding only those transitions the occur in the given			trainingSet. 
*/	public void addStatesForBiLabelsConnectedAsIn (InstanceList trainingSet)	{		int numLabels = outputAlphabet.size();		boolean[][] connections = labelConnectionsIn (trainingSet);		for (int i = 0; i < numLabels; i++) {			for (int j = 0; j < numLabels; j++) {				if (!connections[i][j])					continue;				int numDestinations = 0;				for (int k = 0; k < numLabels; k++)					if (connections[j][k]) numDestinations++;				String[] destinationNames = new String[numDestinations];				String[] labels = new String[numDestinations];				int destinationIndex = 0;				for (int k = 0; k < numLabels; k++)					if (connections[j][k]) {						destinationNames[destinationIndex] =							(String)outputAlphabet.lookupObject(j)+LABEL_SEPARATOR+(String)outputAlphabet.lookupObject(k);						labels[destinationIndex] = (String)outputAlphabet.lookupObject(k);						destinationIndex++;					}				addState ((String)outputAlphabet.lookupObject(i)+LABEL_SEPARATOR+									(String)outputAlphabet.lookupObject(j), 0.0, 0.0,									destinationNames, labels);			}		}	}		public void addFullyConnectedStatesForTriLabels ()	{		String[] labels = new String[outputAlphabet.size()];		// This is assuming the the entries in the outputAlphabet are Strings!		
for (int i = 0; i < outputAlphabet.size(); i++) {			logger.info ("CRF: outputAlphabet.lookup class = "+									 outputAlphabet.lookupObject(i).getClass().getName());			labels[i] = (String) outputAlphabet.lookupObject(i);		}		for (int i = 0; i < labels.length; i++) {			for (int j = 0; j < labels.length; j++) {				for (int k = 0; k < labels.length; k++) {					String[] destinationNames = new String[labels.length];					for (int l = 0; l < labels.length; l++)						destinationNames[l] = labels[j]+LABEL_SEPARATOR+labels[k]+LABEL_SEPARATOR+labels[l];					addState (labels[i]+LABEL_SEPARATOR+labels[j]+LABEL_SEPARATOR+labels[k], 0.0, 0.0,										destinationNames, labels);				}			}		}	}		public void addSelfTransitioningStateForAllLabels (String name)	{		String[] labels = new String[outputAlphabet.size()];		String[] destinationNames  = new String[outputAlphabet.size()];		// This is assuming the the entries in the outputAlphabet are Strings!		for (int i = 0; i < outputAlphabet.size(); i++) {			logger.info ("CRF: outputAlphabet.lookup class = "+													outputAlphabet.lookupObject(i).getClass().getName());			labels[i] = (String) outputAlphabet.lookupObject(i);			destinationNames[i] = name;		}		addState (name, 0.0, 0.0, destinationNames, labels);	}  private String concatLabels(String[] labels)  {    String sep = "";    StringBuffer buf = new StringBuffer();    for (int i = 0; i < labels.length; i++)    {      buf.append(sep).append(labels[i]);      sep = LABEL_SEPARATOR;    }    return buf.toString();  }    private String nextKGram(String[] history, int k, String next)  {    String sep = "";    StringBuffer buf = new StringBuffer();    int start = history.length + 1 - k;    for (int i = start; i < history.length; i++)    {      buf.append(sep).append(history[i]);      sep = LABEL_SEPARATOR;    }    buf.append(sep).append(next);    return buf.toString();  }    private boolean allowedTransition(String prev, String curr,                                    Pattern no, Pattern 
yes)  {    String pair = concatLabels(new String[]{prev, curr});    if (no != null && no.matcher(pair).matches())      return false;    if (yes != null && !yes.matcher(pair).matches())      return false;    return true;  }      private boolean allowedHistory(String[] history, Pattern no, Pattern yes) {    for (int i = 1; i < history.length; i++)      if (!allowedTransition(history[i-1], history[i], no, yes))        return false;    return true;  }  /**   * Assumes that the CRF's output alphabet contains   * <code>String</code>s. Creates an order-<em>n</em> CRF with input   * predicates and output labels given by <code>trainingSet</code>   * and order, connectivity, and weights given by the remaining   * arguments.   *   * @param trainingSet the training instances   * @param orders an array of increasing non-negative numbers giving   * the orders of the features for this CRF. The largest number   * <em>n</em> is the Markov order of the CRF. States are   * <em>n</em>-tuples of output labels. Each of the other numbers   * <em>k</em> in <code>orders</code> represents a weight set shared   * by all destination states whose last (most recent) <em>k</em>   * labels agree. If <code>orders</code> is <code>null</code>, an   * order-0 CRF is built.   * @param defaults If non-null, it must be the same length as   * <code>orders</code>, with <code>true</code> positions indicating   * that the weight set for the corresponding order contains only the   * weight for a default feature; otherwise, the weight set has   * weights for all features built from input predicates.   * @param start The label that represents the context of the start of   * a sequence. It may be also used for sequence labels.  If no label of   * this name exists, one will be added. Connection wills be added between   * the start label and all other labels, even if <tt>fullyConnected</tt> is   * <tt>false</tt>.  This argument may be null, in which case no special   * start state is added.   
* @param forbidden If non-null, specifies what pairs of successive   * labels are not allowed, both for constructing <em>n</em>order   * states or for transitions. A label pair (<em>u</em>,<em>v</em>)   * is not allowed if <em>u</em> + "," + <em>v</em> matches   * <code>forbidden</code>.   * @param allowed If non-null, specifies what pairs of successive   * labels are allowed, both for constructing <em>n</em>order   * states or for transitions. A label pair (<em>u</em>,<em>v</em>)   * is allowed only if <em>u</em> + "," + <em>v</em> matches   * <code>allowed</code>.   * @param fullyConnected Whether to include all allowed transitions,   * even those not occurring in <code>trainingSet</code>,   * @return The name of the start state.   *    */  public String addOrderNStates(InstanceList trainingSet, int[] orders,                                boolean[] defaults, String start,                                Pattern forbidden, Pattern allowed,                                boolean fullyConnected)  {    boolean[][] connections = null;    if (start != null)      outputAlphabet.lookupIndex (start);    if (!fullyConnected)      connections = labelConnectionsIn (trainingSet, start);    int order = -1;    if (defaults != null && defaults.length != orders.length)      throw new IllegalArgumentException("Defaults must be null or match orders");    if (orders == null)      order = 0;    else    {      for (int i = 0; i < orders.length; i++)        if (orders[i] <= order)          throw new IllegalArgumentException("Orders must be non-negative and in ascending order");        else           order = orders[i];      if (order < 0) order = 0;    }    if (order > 0)    {      int[] historyIndexes = new int[order];      String[] history = new String[order];      String label0 = (String)outputAlphabet.lookupObject(0);      for (int i = 0; i < order; i++)        history[i] = label0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -