⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 entropy.java

📁 统计一篇文本文档中的熵
💻 JAVA
字号:
package yus.excerse1;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;

/**
 * Computes the word-level Shannon entropy (in bits) of text.
 *
 * <p>
 * Text is read line by line and split into tokens on whitespace with
 * {@link StringTokenizer}. Every token is lower-cased and counted; the entropy
 * is then {@code -sum(p * log2(p))} over all distinct tokens, where {@code p}
 * is the token's count divided by the total number of tokens.
 */
public class Entropy {

	/** Default input file read by {@link #retChnEntropy()}. */
	private static final String CHN_FILE = "D:\\postGraduate\\chn.txt";

	/** Default input file read by {@link #retEngEntropy()}. */
	private static final String ENG_FILE = "D:\\postGraduate\\eng.txt";

	/**
	 * Prints the entropy of the two default input files followed by the elapsed
	 * wall-clock time in milliseconds.
	 *
	 * @param args
	 *            command-line arguments (not used)
	 * @throws IOException
	 *             if either input file cannot be opened or read
	 */
	public static void main(String[] args) throws IOException {

		long start = System.currentTimeMillis();

		double chnEntropy = retChnEntropy();
		System.out.println("chnEntropy:" + chnEntropy);

		double engEntropy = retEngEntropy();
		System.out.println("engEntropy:" + engEntropy);

		long end = System.currentTimeMillis();
		System.out.println("time lasts " + (end - start) + "ms");
	}

	/**
	 * Returns the word entropy of the default Chinese input file.
	 *
	 * @return entropy in bits, {@code 0.0} if the file contains no tokens
	 * @throws IOException
	 *             if the file cannot be opened or read
	 */
	public static double retChnEntropy() throws IOException {
		return retFileEntropy(CHN_FILE);
	}

	/**
	 * Returns the word entropy of the default English input file.
	 *
	 * @return entropy in bits, {@code 0.0} if the file contains no tokens
	 * @throws IOException
	 *             if the file cannot be opened or read
	 */
	public static double retEngEntropy() throws IOException {
		return retFileEntropy(ENG_FILE);
	}

	/**
	 * Returns the word entropy of the file at {@code path}.
	 *
	 * <p>
	 * The file is decoded with the platform default charset, because no charset
	 * is passed to {@link InputStreamReader}. The reader is always closed, also
	 * when reading fails part-way through.
	 *
	 * @param path
	 *            path of the text file to analyse
	 * @return entropy in bits, {@code 0.0} if the file contains no tokens
	 * @throws IOException
	 *             if the file cannot be opened or read
	 */
	public static double retFileEntropy(String path) throws IOException {
		BufferedReader reader = new BufferedReader(new InputStreamReader(
				new FileInputStream(path)));
		try {
			return retEntropy(reader);
		} finally {
			// Release the file handle even if readLine() throws.
			reader.close();
		}
	}

	/**
	 * Returns the word entropy of all text remaining in {@code reader}.
	 *
	 * <p>
	 * The reader is consumed to the end but not closed; closing it is left to
	 * the caller.
	 *
	 * @param reader
	 *            source of the text to analyse
	 * @return entropy in bits, {@code 0.0} if no tokens were read
	 * @throws IOException
	 *             if reading from {@code reader} fails
	 */
	public static double retEntropy(BufferedReader reader) throws IOException {
		Map<String, Integer> counts = new HashMap<String, Integer>();
		long totalNum = 0;

		String line = reader.readLine();
		while (line != null) {
			StringTokenizer tokenizer = new StringTokenizer(line);
			while (tokenizer.hasMoreTokens()) {
				// Fixed locale so that case folding does not depend on the
				// machine's default locale (e.g. Turkish dotless i).
				String word = tokenizer.nextToken().toLowerCase(Locale.ENGLISH);
				Integer seen = counts.get(word);
				counts.put(word, seen == null ? 1 : seen + 1);
				totalNum++;
			}
			line = reader.readLine();
		}
		return getEntropy(counts, totalNum);
	}

	/**
	 * Sums {@code -p * log2(p)} over every counted word.
	 *
	 * @param counts
	 *            number of occurrences per distinct word
	 * @param totalNum
	 *            total number of tokens, i.e. the sum of all counts
	 * @return entropy in bits, {@code 0.0} when {@code counts} is empty
	 */
	private static double getEntropy(Map<String, Integer> counts, long totalNum) {
		double entropy = 0;
		for (Integer count : counts.values()) {
			double p = count.doubleValue() / totalNum;
			entropy -= p * log(p, 2);
		}
		return entropy;
	}

	/**
	 * Returns the logarithm of {@code value} to the given {@code base}, computed
	 * as {@code Math.log(value) / Math.log(base)}.
	 *
	 * @param value
	 *            number whose logarithm is taken
	 * @param base
	 *            base of the logarithm
	 * @return logarithm of {@code value} in base {@code base}
	 */
	static public double log(double value, double base) {
		return Math.log(value) / Math.log(base);
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -