📄 c99.java

📁 n algorithm for domain independent linear text segmentation This the Windows version of the C99 al
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
				ContextVector.inc(stem, 1, V[i]);
			}
		}
	}

	return V;
}

/**
 * Given a document as a list of tokenised sentences,
 * this function produces a list of stem frequency tables
 * (context vectors), one per sentence. Every stem that is
 * counted for a sentence is also counted in <code>tf</code>.
 * A token is counted only if Punctuation.isWord accepts its
 * lower-cased form and the stopword list does not contain it.
 * Creation date: (11/05/99 03:43:34)
 * @return uk.ac.man.cs.choif.extend.structure.ContextVector[] One context vector per sentence, indexed like S
 * @param S java.lang.String[][] Tokenised sentences
 * @param tf uk.ac.man.cs.choif.extend.structure.ContextVector Term frequencies in document (receives the counts)
 */
private final static ContextVector[] normalize(final String[][] S, ContextVector tf) {
	WordList stopword = WordList.stopwordList();
	ContextVector[] V = new ContextVector[S.length];
	String token, stem;
	for (int i=S.length; i-->0;) {
		V[i] = new ContextVector();
		for (int j=S[i].length; j-->0;) {
			/* Use a fixed locale so that case folding does not depend on the
			platform default (e.g. "I" must not become a dotless i under a
			Turkish locale, which would defeat stopword and stem matching). */
			token = S[i][j].toLowerCase(java.util.Locale.ENGLISH);
			if (Punctuation.isWord(token) && !stopword.has(token)) {
				stem = Stemmer.stemOf(token);
				ContextVector.inc(stem, 1, V[i]);
				ContextVector.inc(stem, 1, tf);
			}
		}
	}

	return V;
}

/**
 * Apply hard ranking (replace each pixel value with the proportion
 * of neighbouring values it is greater than) to a matrix using a S x S mask.
 * Only the lower triangle (including the diagonal) is computed; each
 * result is mirrored into the upper triangle, so F is treated as a
 * square, symmetric matrix.
 * Creation date: (11/02/99 00:05:01)
 * @return float[][] A new F.length x F.length matrix of rank values, each in [0, 1]; F is not modified
 * @param F float[][] Matrix to rank
 * @param S int Size of the ranking mask
 */
private final static float[][] rank(final float[][] F, final int S) {
	float[][] M = new float[F.length][F.length];

	/* Compute the offset used for mask (half width, excluding the centre).
	With integer division (S-1)/2 equals S/2 for odd S, so one expression
	covers both odd and even mask sizes. */
	final int dS = (S - 1) / 2;

	/* Work on M, refers to F */
	int K_is, K_ie, K_js, K_je;
	float v, area;
	for (int M_i=M.length; M_i-->0;) {
		for (int M_j=M_i+1; M_j-->0;) {
			v = F[M_i][M_j]; /* Grab pixel value */
			M[M_i][M_j] = 0; /* Set it to 0 */

			/* Compute effective mask range, clipped to the matrix
			(start inclusive, end exclusive) */
			K_is = M_i - dS; if (K_is < 0) K_is = 0;
			K_ie = M_i + dS + 1; if (K_ie > F.length) K_ie = F.length;
			K_js = M_j - dS; if (K_js < 0) K_js = 0;
			K_je = M_j + dS + 1; if (K_je > F.length) K_je = F.length;

			/* Compute active mask region area for normalization. Subtract 1
			because we ignore the middle pixel which will always be rank 0. */
			area = (K_ie - K_is) * (K_je - K_js) - 1;

			/* Perform ranking: count the mask cells strictly smaller than v,
			then normalise by the area. When the mask covers only the centre
			pixel the rank is left at 0. */
			if (area > 0) {
				for (int K_i=K_ie; K_i-- > K_is;) {
					for (int K_j=K_je; K_j-- > K_js;) {
						if (v > F[K_i][K_j]) M[M_i][M_j]++;
					}
				}
				M[M_i][M_j] /= area;
			}

			/* Mirror into the upper triangle */
			M[M_j][M_i] = M[M_i][M_j];
		}
	}
	return M;
}

/**
 * Given a document as a list of elementary text blocks
 * (usually tokenised sentences), segment the document into n
 * coherent topic segments. If n is -1, the algorithm
 * will decide the appropriate number of segments by
 * monitoring the rate of increase in segment density.
 * Creation date: (11/05/99 05:55:46)
 * @return String[][][] A list of coherent topic segments, each a list of text blocks
 * @param document String[][] A list of elementary text blocks (usually sentences). Each block is an array of tokens.
 * @param n int Number of segments to make, if -1 then let the algorithm decide.
 * @param s int Size of ranking mask, must be >= 3 and an odd number
 */
public final static String[][][] segment(final String[][] document, final int n, final int s) {
	/* Each intermediate result is set to null as soon as the next stage
	has consumed it, so the reference is not held for the rest of the method. */
	Debugx.msg("C99", "Context vectors...");
	ContextVector[] vectors = normalize(document);

	Debugx.msg("C99", "Similarity matrix...");
	float[][] sim = similarity(vectors);
	vectors = null;

	Debugx.msg("C99", "Rank matrix (" + s + "x" + s + " rank mask)...");
	float[][] rank = rank(sim, s);
	sim = null;

	Debugx.msg("C99", "Sum of rank matrix...");
	float[][] sum = sum(rank);
	rank = null;

	Debugx.msg("C99", "Divisive clustering (" + (n==-1 ? "automatic" : "user") + " termination)...");
	int[] B = Arrayx.sortAsc(boundaries(sum, n));
	sum = null;

	Debugx.msg("C99", "Found " + (B.length+1) + " segments...");
	return split(document, B);
}

/**
 * Given a document as a list of elementary text blocks
 * (usually tokenised sentences), segment the document into n
 * coherent topic segments. If n is -1, the algorithm
 * will decide the appropriate number of segments by
 * monitoring the rate of increase in segment density.
 * This variant differs from segment only in the similarity stage:
 * it collects document term frequencies while building the context
 * vectors, builds an EntropyVector from them, and passes that to
 * the similarity computation.
 * Creation date: (11/05/99 05:55:46)
 * @return String[][][] A list of coherent topic segments, each a list of text blocks
 * @param document String[][] A list of elementary text blocks (usually sentences). Each block is an array of tokens.
 * @param n int Number of segments to make, if -1 then let the algorithm decide.
 * @param s int Size of ranking mask, must be >= 3 and an odd number
 */
public final static String[][][] segmentW(final String[][] document, final int n, final int s) {
	/* Each intermediate result is set to null as soon as the next stage
	has consumed it, so the reference is not held for the rest of the method. */
	Debugx.msg("C99", "Context vectors...");
	ContextVector tf = new ContextVector();
	ContextVector[] vectors = normalize(document, tf);

	Debugx.msg("C99", "Similarity matrix...");
	EntropyVector ev = new EntropyVector(tf);
	float[][] sim = similarity(vectors, ev);
	vectors = null;

	Debugx.msg("C99", "Rank matrix (" + s + "x" + s + " rank mask)...");
	float[][] rank = rank(sim, s);
	sim = null;

	Debugx.msg("C99", "Sum of rank matrix...");
	float[][] sum = sum(rank);
	rank = null;

	Debugx.msg("C99", "Divisive clustering (" + (n==-1 ? "automatic" : "user") + " termination)...");
	int[] B = Arrayx.sortAsc(boundaries(sum, n));
	sum = null;

	Debugx.msg("C99", "Found " + (B.length+1) + " segments...");
	return split(document, B);
}

/**
 * Given a list of context vectors, compute the similarity matrix.
 * Entry [i][j] is ContextVector.cos(v[i], v[j]); the lower triangle
 * (including the diagonal) is computed and mirrored, so the result
 * is symmetric.
 * Creation date: (11/05/99 04:45:51)
 * @return float[][] A v.length x v.length symmetric similarity matrix
 * @param v uk.ac.man.cs.choif.extend.structure.ContextVector[] Context vectors, one per text block
 */
private final static float[][] similarity(final ContextVector[] v) {
	float[][] S = new float[v.length][v.length];

	for (int i=v.length; i-->0;) {
		for (int j=i+1; j-->0;) {
			S[i][j] = ContextVector.cos(v[i], v[j]);
			S[j][i] = S[i][j];
		}
	}
	return S;
}

/**
 * Given a list of context vectors, compute the similarity matrix,
 * passing an entropy vector to the similarity measure.
 * Entry [i][j] is ContextVector.cos(v[i], v[j], entropy); the lower
 * triangle (including the diagonal) is computed and mirrored, so the
 * result is symmetric.
 * Creation date: (11/05/99 04:45:51)
 * @return float[][] A v.length x v.length symmetric similarity matrix
 * @param v uk.ac.man.cs.choif.extend.structure.ContextVector[] Context vectors, one per text block
 * @param entropy EntropyVector Passed through to ContextVector.cos (presumably per-term weights; confirm against ContextVector)
 */
private final static float[][] similarity(final ContextVector[] v, final EntropyVector entropy) {
	float[][] S = new float[v.length][v.length];

	for (int i=v.length; i-->0;) {
		for (int j=i+1; j-->0;) {
			S[i][j] = ContextVector.cos(v[i], v[j], entropy);
			S[j][i] = S[i][j];
		}
	}
	return S;
}

/**
 * Given the input text and the topic boundaries,
 * split the text into segment blocks. Segment k holds the
 * blocks from boundary k (inclusive) to boundary k+1 (exclusive),
 * where position 0 and T.length act as implicit first and last
 * boundaries. The block arrays themselves are shared with T, not copied.
 * B must be sorted in ascending order with every value in
 * [0, T.length]; otherwise a segment length becomes negative and
 * the array allocation fails.
 * Creation date: (08/16/99 06:56:26)
 * @return String[][][] Topic segments (B.length + 1 of them)
 * @param T String[][] Source text
 * @param B int[] Boundaries, sorted ascending
 */
private final static String[][][] split (final String[][] T, final int[] B) {
	/* Add the implicit boundaries (start and end) to B */
	int[] b = new int[B.length+2];	/* A list of boundaries, includes implicit boundaries */
	b[0] = 0;
	b[b.length-1] = T.length;
	System.arraycopy(B, 0, b, 1, B.length);

	/* Make the topic segments */
	String[][][] seg = new String[b.length-1][][];
	for (int i=seg.length; i-->0;) {
		seg[i] = new String[b[i+1] - b[i]][];
		System.arraycopy(T, b[i], seg[i], 0, b[i+1] - b[i]);
	}
	return seg;
}

/**
 * Compute the sum of rank matrix. For a square, symmetric M,
 * entry [j][i] (and its mirror [i][j]) of the result is the sum of
 * all values of M inside the square sub-matrix spanning rows and
 * columns i..j. The table is filled diagonal by diagonal so that
 * each entry is derived from previously computed, smaller regions.
 * Creation date: (11/05/99 04:56:32)
 * @return float[][] A new M.length x M.length symmetric matrix of region sums
 * @param M float[][] Rank matrix (treated as square and symmetric)
 */
private final static float[][] sum(final float[][] M) {
	float[][] S = new float[M.length][M.length];

	/* Step 1: regions of size 1 are the diagonal values themselves */
	for (int i=0, ie=M.length; i<ie; i++) S[i][i] = M[i][i];

	/* Step 2: regions of size 2 are the two diagonal cells plus the
	off-diagonal value counted twice (once for each triangle) */
	for (int i=0, ie=M.length-1, ip; i<ie; i++) {
		ip = i+1;
		S[ip][i] = M[ip][i] * 2 + S[i][i] + S[ip][ip];
		S[i][ip] = S[ip][i];
	}

	/* Step 3: larger regions by inclusion-exclusion. The region i..ij is
	the two corner cells (M[ij][i] twice) plus the regions i..ij-1 and
	i+1..ij, minus their overlap i+1..ij-1 which was counted twice. */
	for (int j=2, ij, ip; j<M.length; j++) {
		for (int i=0, ie=M.length-j; i<ie; i++) {
			ij = i+j;
			ip = i+1;
			S[ij][i] = M[ij][i] * 2 + S[ij-1][i] + S[ij][ip] - S[ij-1][ip];
			S[i][ij] = S[ij][i];
		}
	}
	return S;
}
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -