📄 c99.java

📁 n algorithm for domain independent linear text segmentation This the Windows version of the C99 al
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package uk.ac.man.cs.choif.nlp.seg.linear;import java.util.Vector;import uk.ac.man.cs.choif.extend.Argx;import uk.ac.man.cs.choif.extend.Arrayx;import uk.ac.man.cs.choif.extend.Debugx;import uk.ac.man.cs.choif.extend.Stringx;import uk.ac.man.cs.choif.extend.Vectorx;import uk.ac.man.cs.choif.extend.io.LineInput;import uk.ac.man.cs.choif.extend.io.LineOutput;import uk.ac.man.cs.choif.extend.structure.ContextVector;import uk.ac.man.cs.choif.extend.structure.EntropyVector;import uk.ac.man.cs.choif.nlp.surface.Punctuation;import uk.ac.man.cs.choif.nlp.surface.Stemmer;import uk.ac.man.cs.choif.nlp.surface.WordList;//import uk.ac.man.cs.choif.extend.GnuPlotx;import uk.ac.man.cs.choif.nlp.statistics.distribution.*;import uk.ac.man.cs.choif.extend.Statx;/** * C99 algorithm for linear text segmentation * Creation date: (11/05/99 03:30:09) * @author: Freddy Choi */public class C99 {	private final static class Region {		protected int start, end, area = 1;		float sum = 0;		protected Region left = null;		protected Region right = null;		/**		 * Construct a new region with boundaries start and end.		 * @param start int		 * @param end int		 * @param M float[][]		 */		public Region(final int start, final int end, final float[][] M) {			this.start = start;			this.end = end;			area = (end - start + 1) * (end - start + 1);			sum = M[end][start];		}		/**		 * For the region find a boundary that maximises the inside density		 * of the region. The result is stored in the region itself.		 * @param M float[][]		 * @param R Region		 */		public final static void bestBoundary_max(final float[][] M, Region R) {			final int start = R.start;			final int end = R.end;			if (start < end) {				/* For each possible boundary, compute the density to				find the best boundary */				int B = start;				float D_max = 0, D = 0;				for (int i = end, lower = end - start, upper = 1; i-- > start; lower--, upper++) {					/* Compute the inside density */					D = (M[i][start] + M[end][i + 1]) / (float) ((lower * lower) + (upper * upper));					/* Test if it is the maximum density */					if (D > D_max) {						D_max = D;						B = i;					}				}				/* Construct result */				R.left = new Region(start, B, M);				R.right = new Region(B + 1, end, M);			}		}	}/** * For the similarity matrix M, find the boundaries for n regions that * maximizes the inside density. M is obtained with  * Functions.computeSum(float[][] S). * @return Vector A list of boundary in the order that they were selected * @param M float[][] * @param n int Number of regions to find, if -1, the algorithm will decide. */private final static int[] boundaries (final float[][] M, final int n) {	/* A list of regions that can be split. Initialise	it with the entire document as the first region. */	Vector R = new Vector();					// List of splitable regions	Region r = new Region(0, M.length-1, M);	// The entire document as the first region	Region.bestBoundary_max(M, r);				// Find the best split point	R.addElement(r);							// Initialise R with the entire document	/* Split the document right down to elementary units,	i.e. sentences. The boundary selected at each step of	this divisive clustering process is placed in a list B.	Each step increases the inside density, this increase 	is placed in a list dD. */		int[] B = new int[M.length-1];				// A list of boundary locations	float[] dD = new float[M.length-1]	;		// Increase in density due to the split	/* Temporary program variables */	float sum_region=r.sum;						// Sum of similarity values for segmentation	float sum_area = r.area;					// Inside area of segmentation	float D = sum_region / sum_area;			// The current density	float tempD;								// Density of test segmentation	float maxD;									// Maximum density	int index=0;								// Index of region to split	/* Divisive clustering (user or automatic termination) */	for (int i=0, ie=(n != -1 ? n-1 : M.length-1); i<ie; i++) {		/* Find the default region to split. It is		the first splitable region */		index = 0;		do {			r = (Region) R.elementAt(index++);		} while (index < R.size() && r.start == r.end);		index--;				/* Find the best region to split. It is		one that maximizes the overall density */		maxD = Float.MIN_VALUE;		for (int j=R.size(); j-->0;) {			r = (Region) R.elementAt(j);			/* Test if region can be split */			if (r.start < r.end) {				/* Divide the new sum of region by the new area */				tempD = (sum_region-r.sum+r.left.sum+r.right.sum) / (float) (sum_area-r.area+r.left.area+r.right.area);				/* Maximize */							if (tempD > maxD) { maxD = tempD; index = j; }			}		}		/* Split region at index into two */		r = (Region) R.elementAt(index);		Region.bestBoundary_max(M, r.left);		// Find best split point for left region		Region.bestBoundary_max(M, r.right);	// Find best split point for right region		R.setElementAt(r.right, index);		R.insertElementAt(r.left, index);		/* Store results from a step of divisive clustering */		B[i] = r.right.start;					// Boundary location		dD[i] = maxD - D;						// Compute change in density		D = maxD;								// Update current density				/* Update sums */		sum_region = sum_region - r.sum + r.left.sum + r.right.sum;		sum_area = sum_area - r.area + r.left.area + r.right.area;	}	/* Now that we have an ordered list of boundary locations,	the next problem is to decide the number of segments to	have. Granularity is task dependent. The following method	computes # segments by tracking the change in density	caused by each split. */	int[] result;								// Output boundaries	if (n != -1) {		/* User specified # segments */		result = new int[n-1];		System.arraycopy(B, 0, result, 0, n-1);		return result;	}	else {		/* Determine # segments */		dD = Statx.convolve(dD, new float[]{1,2,4,8,4,2,1});	// Smooth gradient		Acc dist = new Acc();									// Capture distribution of gradient		dist.acc(dD);		double threshold = dist.mean() + 1.2 * dist.sd();		// Threshold to capture only unusually large gradient		int number = 0;											// Determine the number of boundaries to have		for (; number<dD.length && dD[number] > threshold; number++);		result = new int[number];								// Generate result		System.arraycopy(B, 0, result, 0, number);		return result;	}}/** * Segment a piece of text using the C99 algorithm * Creation date: (11/05/99 06:05:36) * @param args java.lang.String[] */public final static void main(String[] args) {	Debugx.header("This is C99, an algorithm for linear text segmentation.");	/* Command line arguments processing */	Argx arg = new Argx(args);	int n = arg.get("-n", -1, "Number of segments (default automatic = -1)");	int s = arg.get("-s", 11, "Size of ranking mask (>= 1 and an odd number)");	boolean w = arg.has("-w", false, "Weight context vector with term frequencies");	arg.displayHelp();	if (s < 1 || s % 2 != 1) Debugx.handle(new Error("Parameter -s must be >= 1 and an odd number."));	/* Load document */	Debugx.msg("C99", "Loading document...");	String[][] D = Stringx.tokenize(LineInput.load(new LineInput(System.in)), " ");	//String[][] D = null;	//try { D = Stringx.tokenize(LineInput.load(new LineInput(new java.io.File("/root/temp/test.txt"))), " "); }	//catch (java.io.IOException e) {}	/* Perform segmentation */	String[][][] S = (w ? segmentW(D, n, s) : segment(D, n, s));	Debugx.msg("C99", "Ready.");	/* Print output */	final String sep = "==========";	String line;	LineOutput out = new LineOutput(System.out);	for (int i=0, ie=S.length; i<ie; i++) {		out.println(sep);		for (int j=0, je=S[i].length; j<je; j++) {			line = "";			for (int k=0, ke=S[i][j].length; k<ke; k++) line += (S[i][j][k] + " ");			out.println(line);		}	}	out.println(sep);	out.close();}/** * Given a document as a list of tokenised sentences,  * this function produces a list of stem frequency tables, * or context vector * Creation date: (11/05/99 03:43:34) * @return uk.ac.man.cs.choif.extend.structure.ContextVector[] * @param S java.lang.String[][] */private final static ContextVector[] normalize(final String[][] S) {	WordList stopword = WordList.stopwordList();	ContextVector[] V = new ContextVector[S.length];	String token, stem;	for (int i=S.length; i-->0;) {		V[i] = new ContextVector();		for (int j=S[i].length; j-->0;) {			token = S[i][j].toLowerCase();			if (Punctuation.isWord(token) && !stopword.has(token)) {				stem = Stemmer.stemOf(token);
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -