📄 c99.java
字号:
ContextVector.inc(stem, 1, V[i]); } } } return V;}

/**
 * Given a document as a list of tokenised sentences,
 * this function produces a list of stem frequency tables
 * (context vectors), one per sentence, and accumulates the
 * same stems into a document-wide table.
 * Creation date: (11/05/99 03:43:34)
 *
 * @param S  the document; S[i][j] is the j-th token of the i-th sentence
 * @param tf document-wide term frequency table; every stem counted for a
 *           sentence is also passed to ContextVector.inc for this table,
 *           so the caller's object is modified
 * @return one ContextVector per sentence, in the same order as S
 */
private static final ContextVector[] normalize(final String[][] S, ContextVector tf) {
    WordList stopword = WordList.stopwordList();
    ContextVector[] V = new ContextVector[S.length];
    String token, stem;

    for (int i = S.length; i-- > 0;) {
        V[i] = new ContextVector();
        for (int j = S[i].length; j-- > 0;) {
            /* Lower-case with a fixed locale so that the stop-word lookup and
               the stemmer see the same token whatever the JVM default locale is
               (the default-locale form maps "I" to a dotless i under Turkish). */
            token = S[i][j].toLowerCase(java.util.Locale.ENGLISH);
            if (Punctuation.isWord(token) && !stopword.has(token)) {
                stem = Stemmer.stemOf(token);
                ContextVector.inc(stem, 1, V[i]);
                ContextVector.inc(stem, 1, tf);
            }
        }
    }
    return V;
}

/**
 * Apply hard ranking (replace pixel value with the proportion
 * of neighbouring values it is greater than) to a matrix using an S x S mask.
 * Only the lower triangle (column <= row) is computed from F; each result is
 * mirrored into the upper triangle, so the output is symmetric.
 * Creation date: (11/02/99 00:05:01)
 *
 * @param F square input matrix (indexed as F.length x F.length); not modified
 * @param S mask size; the mask reaches (S-1)/2 cells either side of the
 *          pixel, so an even S behaves like S - 1
 * @return a new F.length x F.length matrix of rank values in [0, 1]
 */
private static final float[][] rank(final float[][] F, final int S) {
    float[][] M = new float[F.length][F.length];

    /* Compute the offset used for mask */
    final int dS = (S % 2 == 1 ? S / 2 : (S - 1) / 2);

    /* Work on M, refers to F */
    int K_is, K_ie, K_js, K_je;
    float v, sum;

    for (int M_i = M.length; M_i-- > 0;) {
        for (int M_j = M_i + 1; M_j-- > 0;) {
            v = F[M_i][M_j];    // Grab pixel value
            M[M_i][M_j] = 0;    // Set it to 0

            /* Compute effective mask range, clipped to the matrix bounds */
            K_is = M_i - dS;
            if (K_is < 0) {
                K_is = 0;
            }
            K_ie = M_i + dS + 1;
            if (K_ie > F.length) {
                K_ie = F.length;
            }
            K_js = M_j - dS;
            if (K_js < 0) {
                K_js = 0;
            }
            K_je = M_j + dS + 1;
            if (K_je > F.length) {
                K_je = F.length;
            }

            /* Compute active mask region area for normalization.
               Subtract 1 because we ignore the middle pixel which
               will always be rank 0. */
            sum = (K_ie - K_is) * (K_je - K_js) - 1;

            /* Perform ranking; a one-cell region (sum == 0) keeps rank 0 */
            if (sum > 0) {
                for (int K_i = K_ie; K_i-- > K_is;) {
                    for (int K_j = K_je; K_j-- > K_js;) {
                        if (v > F[K_i][K_j]) {
                            M[M_i][M_j]++;
                        }
                    }
                }
                M[M_i][M_j] /= sum;
            }

            /* Mirror into the upper triangle */
            M[M_j][M_i] = M[M_i][M_j];
        }
    }
    return M;
}

/**
 * Given a document as a list of elementary text blocks
 * (usually tokenised sentences), segment the document into n
 * coherent topic segments. If n is -1, the algorithm
 * will decide the appropriate number of segments by
 * monitoring the rate of increase in segment density.
 * Creation date: (11/05/99 05:55:46)
 *
 * @param document a list of elementary text blocks (usually sentences);
 *                 each block is an array of tokens
 * @param n number of segments to make, if -1 then let the algorithm decide
 * @param s size of ranking mask, must be >= 3 and an odd number
 * @return the topic segments; each segment is a run of consecutive blocks
 *         of document
 */
public static final String[][][] segment(final String[][] document, final int n, final int s) {
    Debugx.msg("C99", "Context vectors...");
    ContextVector[] vectors = normalize(document);

    Debugx.msg("C99", "Similarity matrix...");
    float[][] sim = similarity(vectors);
    vectors = null;     // drop each intermediate once the next stage has consumed it

    Debugx.msg("C99", "Rank matrix (" + s + "x" + s + " rank mask)...");
    float[][] rank = rank(sim, s);
    sim = null;

    Debugx.msg("C99", "Sum of rank matrix...");
    float[][] sum = sum(rank);
    rank = null;

    Debugx.msg("C99", "Divisive clustering (" + (n==-1 ? "automatic" : "user") + " termination)...");
    int[] B = Arrayx.sortAsc(boundaries(sum, n));
    sum = null;

    Debugx.msg("C99", "Found " + (B.length+1) + " segments...");
    return split(document, B);
}

/**
 * Variant of segment(String[][], int, int) that weights the similarity
 * computation: document-wide term frequencies are collected while building
 * the context vectors, an EntropyVector is built from them, and that vector
 * is passed to the cosine computation. All later stages (ranking, summing,
 * divisive clustering, splitting) are the same as in segment.
 * If n is -1, the algorithm will decide the appropriate number of segments
 * by monitoring the rate of increase in segment density.
 * Creation date: (11/05/99 05:55:46)
 *
 * @param document a list of elementary text blocks (usually sentences);
 *                 each block is an array of tokens
 * @param n number of segments to make, if -1 then let the algorithm decide
 * @param s size of ranking mask, must be >= 3 and an odd number
 * @return the topic segments; each segment is a run of consecutive blocks
 *         of document
 */
public static final String[][][] segmentW(final String[][] document, final int n, final int s) {
    Debugx.msg("C99", "Context vectors...");
    ContextVector tf = new ContextVector();
    ContextVector[] vectors = normalize(document, tf);

    Debugx.msg("C99", "Similarity matrix...");
    EntropyVector ev = new EntropyVector(tf);
    float[][] sim = similarity(vectors, ev);
    vectors = null;     // drop each intermediate once the next stage has consumed it

    Debugx.msg("C99", "Rank matrix (" + s + "x" + s + " rank mask)...");
    float[][] rank = rank(sim, s);
    sim = null;

    Debugx.msg("C99", "Sum of rank matrix...");
    float[][] sum = sum(rank);
    rank = null;

    Debugx.msg("C99", "Divisive clustering (" + (n==-1 ? "automatic" : "user") + " termination)...");
    int[] B = Arrayx.sortAsc(boundaries(sum, n));
    sum = null;

    Debugx.msg("C99", "Found " + (B.length+1) + " segments...");
    return split(document, B);
}

/**
 * Given a list of context vectors, compute the similarity matrix.
 * Entry [i][j] is ContextVector.cos(v[i], v[j]); only j <= i is evaluated
 * and the value is mirrored to [j][i].
 * Creation date: (11/05/99 04:45:51)
 *
 * @param v the context vectors, one per text block
 * @return a symmetric v.length x v.length matrix
 */
private static final float[][] similarity(final ContextVector[] v) {
    float[][] S = new float[v.length][v.length];

    for (int i = v.length; i-- > 0;) {
        for (int j = i + 1; j-- > 0;) {
            S[i][j] = ContextVector.cos(v[i], v[j]);
            S[j][i] = S[i][j];
        }
    }
    return S;
}

/**
 * Given a list of context vectors, compute the similarity matrix using
 * the three-argument ContextVector.cos, which also receives an entropy vector.
 * Entry [i][j] is ContextVector.cos(v[i], v[j], entropy); only j <= i is
 * evaluated and the value is mirrored to [j][i].
 * Creation date: (11/05/99 04:45:51)
 *
 * @param v the context vectors, one per text block
 * @param entropy the entropy vector handed to every cosine computation
 * @return a symmetric v.length x v.length matrix
 */
private static final float[][] similarity(final ContextVector[] v, final EntropyVector entropy) {
    float[][] S = new float[v.length][v.length];

    for (int i = v.length; i-- > 0;) {
        for (int j = i + 1; j-- > 0;) {
            S[i][j] = ContextVector.cos(v[i], v[j], entropy);
            S[j][i] = S[i][j];
        }
    }
    return S;
}

/**
 * Given the input text and the topic boundaries,
 * split the text into segment blocks.
 * Segment k holds blocks b[k] (inclusive) to b[k+1] (exclusive), where b is
 * B with 0 prepended and T.length appended.
 * Creation date: (08/16/99 06:56:26)
 *
 * @param T source text, one entry per elementary block
 * @param B boundaries as indices into T; the caller in this file passes them
 *          sorted ascending
 * @return B.length + 1 topic segments; the inner String[] blocks are shared
 *         with T, not copied
 */
private static final String[][][] split(final String[][] T, final int[] B) {
    /* Add the implicit boundaries (start and end) to B */
    int[] b = new int[B.length + 2];    // A list of boundaries, includes implicit boundaries
    b[0] = 0;
    b[b.length - 1] = T.length;
    System.arraycopy(B, 0, b, 1, B.length);

    /* Make the topic segments */
    String[][][] seg = new String[b.length - 1][][];
    for (int i = seg.length; i-- > 0;) {
        seg[i] = new String[b[i + 1] - b[i]][];
        System.arraycopy(T, b[i], seg[i], 0, b[i + 1] - b[i]);
    }
    return seg;
}

/**
 * Compute the sum of rank matrix.
 * The result is built diagonal by diagonal: the main diagonal is copied from
 * M, and each further entry S[i+j][i] is derived from three already computed
 * neighbours plus twice M[i+j][i]. When M is symmetric this makes S[b][a]
 * (a <= b) the total of all M entries in rows and columns a..b.
 * Every value is mirrored, so S is symmetric.
 * Creation date: (11/05/99 04:56:32)
 *
 * @param M square rank matrix (indexed as M.length x M.length); not modified
 * @return a new M.length x M.length matrix of block sums
 */
private static final float[][] sum(final float[][] M) {
    float[][] S = new float[M.length][M.length];

    /* Step 1: single-cell regions on the main diagonal */
    for (int i = 0, ie = M.length; i < ie; i++) {
        S[i][i] = M[i][i];
    }

    /* Step 2: first off-diagonal, the two diagonal cells plus the
       off-diagonal cell counted twice */
    for (int i = 0, ie = M.length - 1, ip; i < ie; i++) {
        ip = i + 1;
        S[ip][i] = M[ip][i] * 2 + S[i][i] + S[ip][ip];
        S[i][ip] = S[ip][i];
    }

    /* Step 3: remaining diagonals by inclusion-exclusion over the two
       overlapping smaller regions, plus the new corner cell counted twice */
    for (int j = 2, ij, ip; j < M.length; j++) {
        for (int i = 0, ie = M.length - j; i < ie; i++) {
            ij = i + j;
            ip = i + 1;
            S[ij][i] = M[ij][i] * 2 + S[ij - 1][i] + S[ij][ip] - S[ij - 1][ip];
            S[i][ij] = S[ij][i];
        }
    }
    return S;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -