📄 c99.java
字号:
package uk.ac.man.cs.choif.nlp.seg.linear;import java.util.Vector;import uk.ac.man.cs.choif.extend.Argx;import uk.ac.man.cs.choif.extend.Arrayx;import uk.ac.man.cs.choif.extend.Debugx;import uk.ac.man.cs.choif.extend.Stringx;import uk.ac.man.cs.choif.extend.Vectorx;import uk.ac.man.cs.choif.extend.io.LineInput;import uk.ac.man.cs.choif.extend.io.LineOutput;import uk.ac.man.cs.choif.extend.structure.ContextVector;import uk.ac.man.cs.choif.extend.structure.EntropyVector;import uk.ac.man.cs.choif.nlp.surface.Punctuation;import uk.ac.man.cs.choif.nlp.surface.Stemmer;import uk.ac.man.cs.choif.nlp.surface.WordList;//import uk.ac.man.cs.choif.extend.GnuPlotx;import uk.ac.man.cs.choif.nlp.statistics.distribution.*;import uk.ac.man.cs.choif.extend.Statx;/** * C99 algorithm for linear text segmentation * Creation date: (11/05/99 03:30:09) * @author: Freddy Choi */public class C99 { private final static class Region { protected int start, end, area = 1; float sum = 0; protected Region left = null; protected Region right = null; /** * Construct a new region with boundaries start and end. * @param start int * @param end int * @param M float[][] */ public Region(final int start, final int end, final float[][] M) { this.start = start; this.end = end; area = (end - start + 1) * (end - start + 1); sum = M[end][start]; } /** * For the region find a boundary that maximises the inside density * of the region. The result is stored in the region itself. * @param M float[][] * @param R Region */ public final static void bestBoundary_max(final float[][] M, Region R) { final int start = R.start; final int end = R.end; if (start < end) { /* For each possible boundary, compute the density to find the best boundary */ int B = start; float D_max = 0, D = 0; for (int i = end, lower = end - start, upper = 1; i-- > start; lower--, upper++) { /* Compute the inside density */ D = (M[i][start] + M[end][i + 1]) / (float) ((lower * lower) + (upper * upper)); /* Test if it is the maximum density */ if (D > D_max) { D_max = D; B = i; } } /* Construct result */ R.left = new Region(start, B, M); R.right = new Region(B + 1, end, M); } } }/** * For the similarity matrix M, find the boundaries for n regions that * maximizes the inside density. M is obtained with * Functions.computeSum(float[][] S). * @return Vector A list of boundary in the order that they were selected * @param M float[][] * @param n int Number of regions to find, if -1, the algorithm will decide. */private final static int[] boundaries (final float[][] M, final int n) { /* A list of regions that can be split. Initialise it with the entire document as the first region. */ Vector R = new Vector(); // List of splitable regions Region r = new Region(0, M.length-1, M); // The entire document as the first region Region.bestBoundary_max(M, r); // Find the best split point R.addElement(r); // Initialise R with the entire document /* Split the document right down to elementary units, i.e. sentences. The boundary selected at each step of this divisive clustering process is placed in a list B. Each step increases the inside density, this increase is placed in a list dD. */ int[] B = new int[M.length-1]; // A list of boundary locations float[] dD = new float[M.length-1] ; // Increase in density due to the split /* Temporary program variables */ float sum_region=r.sum; // Sum of similarity values for segmentation float sum_area = r.area; // Inside area of segmentation float D = sum_region / sum_area; // The current density float tempD; // Density of test segmentation float maxD; // Maximum density int index=0; // Index of region to split /* Divisive clustering (user or automatic termination) */ for (int i=0, ie=(n != -1 ? n-1 : M.length-1); i<ie; i++) { /* Find the default region to split. It is the first splitable region */ index = 0; do { r = (Region) R.elementAt(index++); } while (index < R.size() && r.start == r.end); index--; /* Find the best region to split. It is one that maximizes the overall density */ maxD = Float.MIN_VALUE; for (int j=R.size(); j-->0;) { r = (Region) R.elementAt(j); /* Test if region can be split */ if (r.start < r.end) { /* Divide the new sum of region by the new area */ tempD = (sum_region-r.sum+r.left.sum+r.right.sum) / (float) (sum_area-r.area+r.left.area+r.right.area); /* Maximize */ if (tempD > maxD) { maxD = tempD; index = j; } } } /* Split region at index into two */ r = (Region) R.elementAt(index); Region.bestBoundary_max(M, r.left); // Find best split point for left region Region.bestBoundary_max(M, r.right); // Find best split point for right region R.setElementAt(r.right, index); R.insertElementAt(r.left, index); /* Store results from a step of divisive clustering */ B[i] = r.right.start; // Boundary location dD[i] = maxD - D; // Compute change in density D = maxD; // Update current density /* Update sums */ sum_region = sum_region - r.sum + r.left.sum + r.right.sum; sum_area = sum_area - r.area + r.left.area + r.right.area; } /* Now that we have an ordered list of boundary locations, the next problem is to decide the number of segments to have. Granularity is task dependent. The following method computes # segments by tracking the change in density caused by each split. */ int[] result; // Output boundaries if (n != -1) { /* User specified # segments */ result = new int[n-1]; System.arraycopy(B, 0, result, 0, n-1); return result; } else { /* Determine # segments */ dD = Statx.convolve(dD, new float[]{1,2,4,8,4,2,1}); // Smooth gradient Acc dist = new Acc(); // Capture distribution of gradient dist.acc(dD); double threshold = dist.mean() + 1.2 * dist.sd(); // Threshold to capture only unusually large gradient int number = 0; // Determine the number of boundaries to have for (; number<dD.length && dD[number] > threshold; number++); result = new int[number]; // Generate result System.arraycopy(B, 0, result, 0, number); return result; }}/** * Segment a piece of text using the C99 algorithm * Creation date: (11/05/99 06:05:36) * @param args java.lang.String[] */public final static void main(String[] args) { Debugx.header("This is C99, an algorithm for linear text segmentation."); /* Command line arguments processing */ Argx arg = new Argx(args); int n = arg.get("-n", -1, "Number of segments (default automatic = -1)"); int s = arg.get("-s", 11, "Size of ranking mask (>= 1 and an odd number)"); boolean w = arg.has("-w", false, "Weight context vector with term frequencies"); arg.displayHelp(); if (s < 1 || s % 2 != 1) Debugx.handle(new Error("Parameter -s must be >= 1 and an odd number.")); /* Load document */ Debugx.msg("C99", "Loading document..."); String[][] D = Stringx.tokenize(LineInput.load(new LineInput(System.in)), " "); //String[][] D = null; //try { D = Stringx.tokenize(LineInput.load(new LineInput(new java.io.File("/root/temp/test.txt"))), " "); } //catch (java.io.IOException e) {} /* Perform segmentation */ String[][][] S = (w ? segmentW(D, n, s) : segment(D, n, s)); Debugx.msg("C99", "Ready."); /* Print output */ final String sep = "=========="; String line; LineOutput out = new LineOutput(System.out); for (int i=0, ie=S.length; i<ie; i++) { out.println(sep); for (int j=0, je=S[i].length; j<je; j++) { line = ""; for (int k=0, ke=S[i][j].length; k<ke; k++) line += (S[i][j][k] + " "); out.println(line); } } out.println(sep); out.close();}/** * Given a document as a list of tokenised sentences, * this function produces a list of stem frequency tables, * or context vector * Creation date: (11/05/99 03:43:34) * @return uk.ac.man.cs.choif.extend.structure.ContextVector[] * @param S java.lang.String[][] */private final static ContextVector[] normalize(final String[][] S) { WordList stopword = WordList.stopwordList(); ContextVector[] V = new ContextVector[S.length]; String token, stem; for (int i=S.length; i-->0;) { V[i] = new ContextVector(); for (int j=S[i].length; j-->0;) { token = S[i][j].toLowerCase(); if (Punctuation.isWord(token) && !stopword.has(token)) { stem = Stemmer.stemOf(token);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -