⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 artificial.java

📁 一个朴素贝叶斯文本分类
💻 JAVA
字号:
import java.util.Hashtable;
import java.util.Vector;
import java.io.*;
/**
 * Two-class naive Bayes text classifier (kind 0 = chemistry, kind 1 = computer science).
 *
 * <p>{@link #main} builds one word-count vocabulary per category from the files under
 * {@code AI Html Resource/Chemistry} and {@code AI Html Resource/Computer Science},
 * sorts each vocabulary by descending count, and then classifies every file in
 * {@code test_for_chem} for several combinations of the smoothing weight {@link #m}
 * and the cut-off {@link #N}, printing how many files landed in each category.
 *
 * <p>All state lives in public static fields, so the class is not safe for concurrent use.
 * It relies on the project types {@code keyAndCount}, {@code object}, {@code Analyze}
 * and {@code HtmlFilter}, which are declared outside this file.
 */
public class Artificial {
    public static final String[] ENGLISH_STOP_WORDS = {
    "a", "an", "and", "are", "am", "as", "at", "be", "but", "by",
    "for", "i", "if", "in", "into", "is", "it", "do",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with","nbsp",
    "pm","td","dropitem","dropdown","hover","drophead","td",
    "you","your"};

    /** How many of the most frequent words {@link #main} prints per category. */
    private static final int TOP_WORDS_SHOWN = 20;

    /**
     * Scratch index used while loading one category: lower-cased word -> position of its
     * entry in that category's vector. Must be cleared before loading another category.
     */
    public static Hashtable ht = new Hashtable();
    /** Vocabulary ({@code keyAndCount} entries) for chemistry, kind 0. */
    public static Vector chem = new Vector();
    /** Vocabulary ({@code keyAndCount} entries) for computer science, kind 1. */
    public static Vector cs = new Vector();
    /** Words that are never counted; filled by {@link #InitStopwords()}. */
    public static Vector stopwords = new Vector();
    /** Number of counted word occurrences per kind. */
    public static int[] totalWords = {0, 0};
    /** Number of training files loaded per kind. */
    public static int[] totalFiles = {0, 0};
    /** Class priors per kind, set by {@link #LearnNaiveBayesText}. */
    public static double[] P = {0, 0};
    /** Smoothing weight m in (count + m/|vocabulary|) / (totalWords + m). */
    public static int m = 0;
    /** The first N entries of each sorted vocabulary are left out of the lookup tables. */
    public static int N;

    /**
     * Trains on the two corpora, prints per-category statistics, then runs the
     * classification experiment over the {@code test_for_chem} directory for every
     * combination of m and N.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        try {
            InitStopwords();

            // File(parent, child) instead of a hard-coded '\\' so the paths also resolve
            // on platforms whose separator is not a backslash.
            File corpusRoot = new File("AI Html Resource");
            Recursion(new File(corpusRoot, "Chemistry"), chem, 0);
            ht.clear(); // indices in ht refer to chem only; reset before loading cs
            Recursion(new File(corpusRoot, "Computer Science"), cs, 1);
            ht.clear();

            sortAndReport("\nChemistry....", chem, 0);
            sortAndReport("\n\nComputer Science....", cs, 1);

            // Values of N to try.
            int[] NN = {0, 30, 50, 100, 300, 500, 1000};
            // Values of m to try.
            int[] mm = {10, 100, 500, 1000, 3000};
            File dir = new File("test_for_chem");

            for (int i = 0; i < mm.length; i++) {
                m = mm[i];
                // Frequencies depend on m, so they are recomputed for every m.
                LearnNaiveBayesText(chem, 0);
                LearnNaiveBayesText(cs, 1);
                System.out.println("\nm=" + mm[i]);

                for (int k = 0; k < NN.length; k++) {
                    N = NN[k];
                    if (!dir.isDirectory()) {
                        continue;
                    }
                    File[] files = dir.listFiles();
                    if (files == null) {
                        // listFiles() reports an I/O error as null rather than throwing.
                        continue;
                    }

                    int chemistry = 0;
                    int computer = 0;
                    for (int s = 0; s < files.length; s++) {
                        // Only regular files can be read; a sub-directory would otherwise
                        // be "classified" from the priors alone.
                        if (!files[s].isFile()) {
                            continue;
                        }
                        if (ClassifyNaiveBayesText(files[s].getAbsoluteFile())) {
                            chemistry++;
                        } else {
                            computer++;
                        }
                    }
                    int classified = chemistry + computer;

                    System.out.println("N=" + NN[k]);
                    System.out.println("chem files=" + chemistry + "\tcs files=" + computer);
                    System.out.print("chem rate=" + (double) chemistry / classified);
                    System.out.print("\tcs rate=" + (double) computer / classified + "\n");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies {@link #ENGLISH_STOP_WORDS} into {@link #stopwords}, skipping words that are
     * already present so repeated calls do not grow the list.
     */
    public static void InitStopwords() {
        for (int k = 0; k < ENGLISH_STOP_WORDS.length; k++) {
            if (!stopwords.contains(ENGLISH_STOP_WORDS[k])) {
                stopwords.add(ENGLISH_STOP_WORDS[k]);
            }
        }
    }

    /**
     * Walks {@code file} depth-first and loads every readable regular file into {@code vt}.
     *
     * @param file file or directory to visit; unreadable entries are skipped
     * @param vt   vocabulary vector of the category being trained
     * @param kind category index (0 or 1) used for the file and word totals
     */
    public static void Recursion(File file, Vector vt, int kind) {
        if (!file.canRead()) {
            return;
        }
        if (file.isFile()) {
            System.out.println("add..." + file.toString());
            totalFiles[kind]++;
            LoadfileAndStat(file, vt, kind);
            return;
        }
        File[] files = file.listFiles();
        if (files == null) {
            // Not a directory, or the listing failed.
            return;
        }
        for (int i = 0; i < files.length; i++) {
            Recursion(files[i].getAbsoluteFile(), vt, kind);
        }
    }

    /**
     * Reads {@code file} line by line and adds its counted words to {@code vt}, using
     * {@link #ht} to find words that already have an entry.
     *
     * <p>If reading or tokenizing fails, the stack trace is printed and the rest of the
     * file is skipped; words counted before the failure are kept.
     *
     * @param file training file to read (decoded with the platform default charset)
     * @param vt   vocabulary vector of the category being trained
     * @param kind category index (0 or 1) whose word total is incremented
     */
    public static void LoadfileAndStat(File file, Vector vt, int kind) {
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(file));
            String line;
            // readLine() returns null at end of file; test it instead of relying on an NPE.
            while ((line = in.readLine()) != null) {
                Vector tokens = tokenizeLine(line);
                if (tokens == null) {
                    continue;
                }
                for (int i = 0; i < tokens.size(); i++) {
                    String word = countableWord((object) tokens.elementAt(i));
                    if (word == null) {
                        continue;
                    }
                    totalWords[kind]++;
                    Integer slot = (Integer) ht.get(word);
                    if (slot != null) {
                        ((keyAndCount) vt.elementAt(slot.intValue())).count++;
                    } else {
                        keyAndCount entry = new keyAndCount();
                        entry.key = word;
                        entry.count = 1;
                        vt.add(entry);
                        ht.put(word, Integer.valueOf(vt.size() - 1));
                    }
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Computes the prior {@code P[kind]} and the smoothed frequency of every word in
     * {@code vt} as (count + m/|vt|) / (totalWords[kind] + m).
     *
     * @param vt   vocabulary vector whose entries receive their {@code frequency}
     * @param kind category index (0 or 1)
     */
    public static void LearnNaiveBayesText(Vector vt, int kind) {
        P[kind] = (double) totalFiles[kind] / (totalFiles[0] + totalFiles[1]);
        // Must be a floating-point division: with int operands m/|vt| collapses to 0
        // whenever m is smaller than the vocabulary, which disagrees with the value
        // ClassifyNaiveBayesText uses for unseen words.
        double smoothing = (double) m / vt.size();
        double denominator = totalWords[kind] + m;
        for (int i = 0; i < vt.size(); i++) {
            keyAndCount entry = (keyAndCount) vt.elementAt(i);
            entry.frequency = (entry.count + smoothing) / denominator;
        }
    }

    /**
     * Classifies one file by summing log-probabilities of its counted words for both
     * categories, starting from the log priors.
     *
     * <p>Words missing from a category's lookup table contribute the smoothed
     * zero-count frequency of that category. If the file cannot be read completely,
     * a message is written to {@code System.err} and the decision is based on whatever
     * was read so far.
     *
     * @param fileToClassify file to read (decoded with the platform default charset)
     * @return {@code true} if the chemistry score is strictly greater than the
     *         computer-science score, {@code false} otherwise
     */
    public static boolean ClassifyNaiveBayesText(File fileToClassify) {
        double[] Vnb = new double[2];
        Vnb[0] = Math.log(P[0]);
        Vnb[1] = Math.log(P[1]);

        // Frequency assigned to a word with zero occurrences in the category.
        double frequencyNullChem = ((double) m / chem.size()) / (totalWords[0] + m);
        double frequencyNullCS = ((double) m / cs.size()) / (totalWords[1] + m);

        // NOTE(review): the N most frequent words are left out of the tables, so they are
        // scored as unseen words rather than ignored - confirm that this is intended.
        Hashtable chemIndex = indexFrom(chem, N);
        Hashtable csIndex = indexFrom(cs, N);

        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(fileToClassify));
            String line;
            while ((line = in.readLine()) != null) {
                // Same trimming and tokenizing as in training, for every line.
                Vector tokens = tokenizeLine(line);
                if (tokens == null) {
                    continue;
                }
                for (int i = 0; i < tokens.size(); i++) {
                    String word = countableWord((object) tokens.elementAt(i));
                    if (word == null) {
                        continue;
                    }
                    Vnb[0] += Math.log(frequencyOf(chem, chemIndex, word, frequencyNullChem));
                    Vnb[1] += Math.log(frequencyOf(cs, csIndex, word, frequencyNullCS));
                }
            }
        } catch (Exception ex) {
            System.err.println("Could not fully read " + fileToClassify + ": " + ex);
        } finally {
            closeQuietly(in);
        }
        return Vnb[0] > Vnb[1];
    }

    /**
     * Sorts {@code vt[left..right]} in place by descending {@code count} using quicksort
     * with the middle element as pivot.
     *
     * @param vt    vector of {@code keyAndCount} entries
     * @param left  first index of the range (inclusive)
     * @param right last index of the range (inclusive); a range with fewer than two
     *              elements, including {@code right = -1} for an empty vector, is a no-op
     */
    public static void QuickSort(Vector vt, int left, int right) {
        if (left >= right) {
            return;
        }
        int i = left;
        int j = right;
        // left + (right - left) / 2 avoids the int overflow of (left + right) / 2.
        keyAndCount pivot = (keyAndCount) vt.elementAt(left + (right - left) / 2);

        do {
            while ((((keyAndCount) vt.elementAt(i)).count > pivot.count) && (i < right)) {
                i++;
            }
            while ((((keyAndCount) vt.elementAt(j)).count < pivot.count) && (j > left)) {
                j--;
            }
            if (i <= j) {
                Object swapped = vt.elementAt(i);
                vt.setElementAt(vt.elementAt(j), i);
                vt.setElementAt(swapped, j);
                i++;
                j--;
            }
        } while (i <= j);

        if (left < j) {
            QuickSort(vt, left, j);
        }
        if (right > i) {
            QuickSort(vt, i, right);
        }
    }

    /** Sorts one vocabulary and prints its totals and most frequent words. */
    private static void sortAndReport(String banner, Vector words, int kind) {
        QuickSort(words, 0, words.size() - 1);
        System.out.println(banner);
        System.out.println("TotalWords=" + totalWords[kind]);
        System.out.println("TotalFiles=" + totalFiles[kind]);
        System.out.println("\tTop Frequency 20");
        // A small corpus may hold fewer distinct words than the report asks for.
        int shown = Math.min(TOP_WORDS_SHOWN, words.size());
        for (int j = 0; j < shown; j++) {
            keyAndCount kac = (keyAndCount) words.elementAt(j);
            System.out.print(kac.key + ",");
        }
    }

    /** Trims a raw line, strips its HTML tags and tokenizes it; null when nothing is left. */
    private static Vector tokenizeLine(String rawLine) throws Exception {
        String text = HtmlFilter.TagRemove(rawLine.trim());
        if (text.equals("")) {
            return null;
        }
        return new Analyze(text).analyze();
    }

    /**
     * Returns the lower-cased text of a token that should be counted, or null for tokens
     * that are ignored: kind other than 1 (presumably "word" - confirm against Analyze),
     * single characters, and stop words.
     */
    private static String countableWord(object token) {
        if (token.kind != 1 || token.string.length() <= 1) {
            return null;
        }
        String word = token.string.toLowerCase();
        return stopwords.contains(word) ? null : word;
    }

    /** Builds a word -> index table for the entries of {@code words} from {@code start} on. */
    private static Hashtable indexFrom(Vector words, int start) {
        Hashtable index = new Hashtable();
        for (int i = start; i < words.size(); i++) {
            keyAndCount entry = (keyAndCount) words.elementAt(i);
            index.put(entry.key, Integer.valueOf(i));
        }
        return index;
    }

    /** Looks up the learned frequency of {@code word}, or {@code unseen} if it is not indexed. */
    private static double frequencyOf(Vector words, Hashtable index, String word, double unseen) {
        Integer slot = (Integer) index.get(word);
        if (slot == null) {
            return unseen;
        }
        return ((keyAndCount) words.elementAt(slot.intValue())).frequency;
    }

    /** Closes a reader that may be null, ignoring a failure to close. */
    private static void closeQuietly(Reader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if close() fails after reading.
        }
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -