⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vsm.java

📁 用TFIDF和特征增益两种方式实现了特征向量空间的建立
💻 JAVA
字号:
package yus.baseline;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

/**
 * Builds vector-space-model feature vectors for a fixed corpus of text files.
 *
 * <p>The program counts words per file and over the whole corpus, scores every
 * word twice (an information-gain style score and a TF-IDF style score), ranks
 * the words by each score and keeps the top {@link #Dimensionality} words as
 * feature dimensions.
 *
 * <p>All maps share one key convention: {@code "<prefix>/<name>"}, for example
 * {@code "3.txt/word"}, {@code "total/word"}, {@code "totalWords/3.txt"} and
 * {@code "totalWords/allFiles"}.
 */
public class Vsm {

	/** Names of the corpus documents, "1.txt" through "100.txt". */
	final static String[] FILES = numberedFileNames(100);

	/** Stop-word list, one word per line, opened relative to the working directory. */
	final static String StopWordFile = "stop_words_ch.txt";

	/** Maximum number of top-ranked words kept as feature dimensions. */
	final static int Dimensionality = 100;

	/** Placeholder score written by selectFeatureByIG when a (file, word) score is missing. */
	final static double MIN = -9999.0;

	/**
	 * Runs the whole pipeline (counting, scoring, both feature selections) and
	 * prints the elapsed wall-clock time.
	 *
	 * @param args unused
	 * @throws IOException if the stop-word file or a corpus file cannot be read
	 */
	public static void main(String[] args) throws IOException {
		long startTime = System.currentTimeMillis();

		Map<String, Integer> wordInFileHM = new HashMap<String, Integer>(); // N(f,w)
		Map<String, Integer> wordHM = new HashMap<String, Integer>(); // N(w)
		// total number of words per file and across all files
		Map<String, Integer> totalWordHM = new HashMap<String, Integer>();
		// information-gain score of each word
		Map<String, Double> valueHM = new HashMap<String, Double>();

		List<String> wordList = statistic(wordInFileHM, wordHM, totalWordHM,
				new ArrayList<String>());

		computePlusValue(valueHM, wordInFileHM, wordHM, totalWordHM, wordList);

		selectFeatureByIG(wordList, valueHM);

		selectFeatureByTFIDF(wordList, wordInFileHM, totalWordHM);

		long endTime = System.currentTimeMillis();
		System.out.println("The spending time is: " + (endTime - startTime)
				+ " ms");
	}

	/**
	 * Ranks the words by their information-gain score and builds one feature
	 * vector per file from the top-ranked words.
	 *
	 * @param wordList candidate words
	 * @param valueHM  scores keyed by {@code word} (corpus-wide) and by
	 *                 {@code "<file>/<word>"} (per file)
	 * @return {@code [file][dimension][2]} where slot 0 is the word and slot 1
	 *         is its per-file score as text ({@link #MIN} when absent); the
	 *         dimension count is {@code min(Dimensionality, wordList.size())}
	 */
	private static String[][][] selectFeatureByIG(List<String> wordList,
			Map<String, Double> valueHM) {
		String[] waitSelect = wordList.toArray(new String[wordList.size()]);
		px(waitSelect, valueHM);

		// Never index past the ranked array when fewer words than dimensions exist.
		int dims = Math.min(Dimensionality, waitSelect.length);
		String[][][] featurePlus = new String[FILES.length][dims][2];
		for (int m = 0; m < FILES.length; m++) {
			for (int n = 0; n < dims; n++) {
				Double perFile = valueHM.get(FILES[m] + "/" + waitSelect[n]);
				featurePlus[m][n][0] = waitSelect[n];
				if (perFile == null) {
					featurePlus[m][n][1] = String.valueOf(MIN);
				} else {
					featurePlus[m][n][1] = String.valueOf(perFile);
				}
			}
		}
		return featurePlus;
	}

	/**
	 * Ranks the words by their summed TF-IDF style score and builds one feature
	 * vector per file from the top-ranked words.
	 *
	 * <p>The per-file weight is {@code N(f,w) * ln(totalWords / nj)} where
	 * {@code nj} is the number of files containing the word.
	 * NOTE(review): the numerator is the corpus word total, not the number of
	 * documents as in textbook IDF - confirm this is intended.
	 *
	 * @param wordList     candidate words
	 * @param wordInFileHM per-file word counts keyed by {@code "<file>/<word>"}
	 * @param totalWordHM  word totals; only {@code "totalWords/allFiles"} is read
	 * @return {@code [file][dimension][2]} where slot 0 is the word and slot 1
	 *         is the weight of that word in that file as text ("0" when the
	 *         word does not occur in the file); the dimension count is
	 *         {@code min(Dimensionality, wordList.size())}
	 */
	private static String[][][] selectFeatureByTFIDF(List<String> wordList,
			Map<String, Integer> wordInFileHM, Map<String, Integer> totalWordHM) {
		String[] waitSelect = wordList.toArray(new String[wordList.size()]);
		int fileLen = FILES.length;
		int wordLen = waitSelect.length;

		// nj[n]: number of files in which word n occurs at least once.
		int[] nj = new int[wordLen];
		for (int n = 0; n < wordLen; n++) {
			for (int j = 0; j < fileLen; j++) {
				if (wordInFileHM.get(FILES[j] + "/" + waitSelect[n]) != null) {
					nj[n]++;
				}
			}
		}

		Integer corpusTotal = totalWordHM.get("totalWords" + "/" + "allFiles");

		Map<String, Double> tfidfHM = new HashMap<String, Double>();
		for (int m = 0; m < fileLen; m++) {
			for (int n = 0; n < wordLen; n++) {
				String word = waitSelect[n];
				Integer tf = wordInFileHM.get(FILES[m] + "/" + word);

				// A word absent from the file (or a missing corpus total) weighs 0;
				// when tf is present nj[n] is at least 1, so the division is safe.
				double subValue = 0;
				if (tf != null && corpusTotal != null) {
					subValue = tf.intValue()
							* Math.log(corpusTotal.doubleValue() / nj[n]);
				}

				addScore(tfidfHM, word, subValue);
				tfidfHM.put(FILES[m] + "/" + word, subValue);
			}
		}

		px(waitSelect, tfidfHM);

		int dims = Math.min(Dimensionality, wordLen);
		String[][][] featureVSM = new String[fileLen][dims][2];
		for (int m = 0; m < fileLen; m++) {
			for (int n = 0; n < dims; n++) {
				String key = FILES[m] + "/" + waitSelect[n];
				featureVSM[m][n][0] = waitSelect[n];
				if (wordInFileHM.get(key) == null) {
					featureVSM[m][n][1] = String.valueOf(0);
				} else {
					// Per-file weight, mirroring selectFeatureByIG; the corpus-wide
					// sum is only used for ranking.
					featureVSM[m][n][1] = String.valueOf(tfidfHM.get(key));
				}
			}
		}

		return featureVSM;
	}

	/**
	 * Sorts the words in place by descending score.
	 *
	 * <p>The sort is stable, so words with equal scores keep their incoming
	 * order. A word without a score sorts after every scored word.
	 *
	 * @param waitSelect words to reorder
	 * @param valueHM    score of each word, keyed by the word itself
	 */
	private static void px(String[] waitSelect, final Map<String, Double> valueHM) {
		Arrays.sort(waitSelect, new Comparator<String>() {
			@Override
			public int compare(String left, String right) {
				// Arguments swapped on purpose: larger scores come first.
				return Double.compare(scoreOf(valueHM, right),
						scoreOf(valueHM, left));
			}
		});
	}

	/**
	 * Computes the information-gain style score of every word.
	 *
	 * <p>For each file {@code f} and word {@code w} the contribution is
	 * {@code wf*ln(wf/w*F) + nwf*ln(nwf/nf*F)} with {@code wf = N(f,w)},
	 * {@code w = N(w)}, {@code nwf} = other words in the file, {@code nf} =
	 * other words in the corpus and {@code F} = number of files. A term whose
	 * count is 0 contributes 0. The contribution is stored under
	 * {@code "<file>/<word>"} and added to the running total stored under
	 * {@code word}.
	 *
	 * @param valueHM      output map, updated in place
	 * @param wordInFileHM per-file word counts keyed by {@code "<file>/<word>"}
	 * @param wordHM       corpus word counts keyed by {@code "total/<word>"}
	 * @param totalWordHM  word totals keyed by {@code "totalWords/<file>"} and
	 *                     {@code "totalWords/allFiles"}
	 * @param wordList     words to score
	 */
	private static void computePlusValue(Map<String, Double> valueHM,
			Map<String, Integer> wordInFileHM, Map<String, Integer> wordHM,
			Map<String, Integer> totalWordHM, List<String> wordList) {
		int fileLen = FILES.length;
		double allWords = countOf(totalWordHM, "totalWords" + "/" + "allFiles");

		for (int i = 0; i < fileLen; i++) {
			// A file with no counted words has no entry; treat it as 0 words.
			double fileWords = countOf(totalWordHM, "totalWords" + "/"
					+ FILES[i]);
			for (String word : wordList) {
				double wf = countOf(wordInFileHM, FILES[i] + "/" + word);
				double w = countOf(wordHM, "total" + "/" + word);
				double nwf = fileWords - wf;
				double nf = allWords - w;

				double subValue = weightedLog(wf, w, fileLen)
						+ weightedLog(nwf, nf, fileLen);

				addScore(valueHM, word, subValue);
				valueHM.put(FILES[i] + "/" + word, subValue);
			}
		}
	}

	/**
	 * Reads every corpus file and fills the count maps.
	 *
	 * <p>Each whitespace-separated token is cut at its first '/', checked
	 * against the stop-word table, stripped of leading full-width symbols and
	 * trimmed; what remains is counted. Words seen for the first time are
	 * appended to {@code wordList}.
	 *
	 * <p>NOTE(review): files are decoded with the platform default charset
	 * while isSymbol assumes GBK - confirm the corpus encoding.
	 *
	 * @param wordInFileHM output, counts keyed by {@code "<file>/<word>"}
	 * @param wordHM       output, counts keyed by {@code "total/<word>"}
	 * @param totalWordHM  output, totals keyed by {@code "totalWords/<file>"}
	 *                     and {@code "totalWords/allFiles"}
	 * @param wordList     list that receives each distinct word once
	 * @return the words of {@code wordList} that pass {@code delSmallWord}
	 * @throws IOException if the stop-word file or a corpus file cannot be read
	 */
	private static List<String> statistic(Map<String, Integer> wordInFileHM,
			Map<String, Integer> wordHM, Map<String, Integer> totalWordHM,
			List<String> wordList) throws IOException {
		Map<String, Integer> stopWordHM = new HashMap<String, Integer>();
		getStopWordTable(stopWordHM);

		// Constant-time membership test instead of scanning wordList per token.
		Set<String> seen = new HashSet<String>(wordList);

		String path = new File("").getAbsolutePath() + "/text/desFile/";
		for (int i = 0; i < FILES.length; i++) {
			BufferedReader br = new BufferedReader(new InputStreamReader(
					new FileInputStream(path + FILES[i])));
			try {
				String line;
				while ((line = br.readLine()) != null) {
					StringTokenizer tokenizer = new StringTokenizer(line);
					while (tokenizer.hasMoreTokens()) {
						String[] a = tokenizer.nextToken().trim().split("/");
						// A token made only of '/' splits into an empty array.
						if (a.length == 0 || stopWordHM.get(a[0]) != null) {
							continue;
						}
						// Use one normalised form for both the counts and the list.
						String word = delQJFH(a[0]).trim();
						if (word.length() == 0) {
							continue;
						}
						addToDic(wordInFileHM, FILES[i], word);
						addToDic(wordHM, "total", word);
						addToDic(totalWordHM, "totalWords", FILES[i]);
						addToDic(totalWordHM, "totalWords", "allFiles");
						if (seen.add(word)) {
							wordList.add(word);
						}
					}
				}
			} finally {
				br.close();
			}
		}

		return delSmallWord(wordList, wordHM);
	}

	/**
	 * Keeps only the words that occur more than 10 times in the corpus.
	 *
	 * @param wordList candidate words
	 * @param wordHM   corpus counts keyed by {@code "total/<word>"}
	 * @return a new list with the frequent words, in their original order;
	 *         words without a count are dropped
	 */
	private static List<String> delSmallWord(List<String> wordList,
			Map<String, Integer> wordHM) {
		int threshold = 10;
		List<String> list = new ArrayList<String>();
		for (String word : wordList) {
			Integer total = wordHM.get("total/" + word);
			if (total != null && total.intValue() > threshold) {
				list.add(word);
			}
		}
		return list;
	}

	/**
	 * Removes leading characters that isSymbol classifies as symbols.
	 *
	 * @param s text to strip; {@code null} is treated as empty
	 * @return the remainder, possibly empty
	 * @throws UnsupportedEncodingException if the runtime has no GBK charset
	 */
	private static String delQJFH(String s) throws UnsupportedEncodingException {
		if (s == null) {
			return "";
		}
		while (isSymbol(s)) {
			s = s.substring(1);
		}
		return s;
	}

	/**
	 * Tells whether the first character of {@code s} encodes in GBK to a byte
	 * sequence starting with 0xA1 (-95 as a signed byte).
	 *
	 * @param s text to inspect
	 * @return {@code false} for {@code null} or empty input
	 * @throws UnsupportedEncodingException if the runtime has no GBK charset
	 */
	private static boolean isSymbol(String s)
			throws UnsupportedEncodingException {
		if (s == null || s.length() == 0) {
			return false;
		}
		// Only the first byte matters, so encode just the first character.
		byte[] b = s.substring(0, 1).getBytes("GBK");
		return b.length > 0 && b[0] == -95;
	}

	/**
	 * Increments the counter stored under {@code "<file>/<word>"}, starting at
	 * 1 when the key is new.
	 *
	 * @param hm   counter map, updated in place
	 * @param file key prefix
	 * @param word key suffix
	 */
	private static void addToDic(Map<String, Integer> hm, String file,
			String word) {
		String key = file + "/" + word;
		Integer current = hm.get(key);
		hm.put(key, Integer.valueOf(current == null ? 1
				: current.intValue() + 1));
	}

	/**
	 * Loads the stop-word file into the given map; each trimmed line becomes a
	 * key whose value is its zero-based line index.
	 *
	 * @param stopWordHM map to fill
	 * @throws IOException if the file cannot be opened or read
	 */
	private static void getStopWordTable(Map<String, Integer> stopWordHM)
			throws IOException {
		BufferedReader br = new BufferedReader(new InputStreamReader(
				new FileInputStream(StopWordFile)));
		try {
			int i = 0;
			String line;
			while ((line = br.readLine()) != null) {
				stopWordHM.put(line.trim(), i++);
			}
		} finally {
			br.close();
		}
	}

	/** Returns {"1.txt", "2.txt", ..., "<count>.txt"}. */
	private static String[] numberedFileNames(int count) {
		String[] names = new String[count];
		for (int i = 0; i < count; i++) {
			names[i] = (i + 1) + ".txt";
		}
		return names;
	}

	/** Returns the count stored under key as a double, or 0 when absent. */
	private static double countOf(Map<String, Integer> counts, String key) {
		Integer value = counts.get(key);
		return value == null ? 0.0 : value.doubleValue();
	}

	/** Returns the score stored under key, or negative infinity when absent. */
	private static double scoreOf(Map<String, Double> scores, String key) {
		Double value = scores.get(key);
		return value == null ? Double.NEGATIVE_INFINITY : value.doubleValue();
	}

	/** Adds delta to the score stored under key, starting from delta when absent. */
	private static void addScore(Map<String, Double> scores, String key,
			double delta) {
		Double soFar = scores.get(key);
		scores.put(key, soFar == null ? delta : soFar.doubleValue() + delta);
	}

	/** Returns count*ln(count/total*fileLen), with a zero count contributing 0. */
	private static double weightedLog(double count, double total, int fileLen) {
		if (count == 0) {
			return 0;
		}
		return count * Math.log(count / total * fileLen);
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -