📄 artificial.java
字号:
import java.util.Hashtable;
import java.util.Vector;
import java.io.*;
/**
 * Two-category naive Bayes text classifier (category 0 = chemistry,
 * category 1 = computer science).
 *
 * <p>{@link #main} builds one word-count vocabulary per category from a
 * directory tree of HTML files, sorts each vocabulary by descending count,
 * and then classifies every file in {@code test_for_chem} for several values
 * of the smoothing weight {@link #m} and of {@link #N} (how many of the most
 * frequent words are ignored).
 *
 * <p>All state is kept in public static fields; the class is not designed for
 * concurrent use.
 */
public class Artificial {

    /** Words (and a few HTML/CSS residue tokens) that are never counted. */
    public static final String[] ENGLISH_STOP_WORDS = {
        "a", "an", "and", "are", "am", "as", "at", "be", "but", "by",
        "for", "i", "if", "in", "into", "is", "it", "do",
        "no", "not", "of", "on", "or", "s", "such",
        "t", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with", "nbsp",
        "pm", "td", "dropitem", "dropdown", "hover", "drophead", "td",
        "you", "your"};

    /**
     * Scratch index used while loading ONE category: lower-cased word ->
     * Integer position of its entry in that category's vector. It must be
     * cleared between categories and before the vector is sorted.
     */
    public static Hashtable ht = new Hashtable();

    /** Vocabulary (keyAndCount entries) of the chemistry category, kind 0. */
    public static Vector chem = new Vector();

    /** Vocabulary (keyAndCount entries) of the computer-science category, kind 1. */
    public static Vector cs = new Vector();

    /** Stop words as a Vector; filled by {@link #InitStopwords()}. */
    public static Vector stopwords = new Vector();

    /** Number of counted word occurrences per category. */
    public static int[] totalWords = {0, 0};

    /** Number of training files per category. */
    public static int[] totalFiles = {0, 0};

    /** Prior probability of each category, set by {@link #LearnNaiveBayesText}. */
    public static double[] P = {0, 0};

    /** Smoothing weight of the m-estimate used for word probabilities. */
    public static int m = 0;

    /**
     * Number of leading vocabulary entries left out of the lookup tables in
     * {@link #ClassifyNaiveBayesText}. Once the vocabularies are sorted by
     * descending count, this means the N most frequent words of each
     * category are treated as unseen.
     */
    public static int N;

    /**
     * Trains on the two resource directories, prints the 20 most frequent
     * words of each category, then evaluates the classifier on the files in
     * {@code test_for_chem} for every combination of m and N.
     */
    public static void main(String args[]) {
        try {
            InitStopwords();
            // File(parent, child) inserts the platform separator, so the
            // same code works on Windows and on Unix-like systems.
            Recursion(new File("AI Html Resource", "Chemistry"), chem, 0);
            ht.clear();
            Recursion(new File("AI Html Resource", "Computer Science"), cs, 1);
            ht.clear();

            QuickSort(chem, 0, chem.size() - 1);
            printCategorySummary("\nChemistry....", chem, 0);

            QuickSort(cs, 0, cs.size() - 1);
            printCategorySummary("\n\nComputer Science....", cs, 1);

            // Values of N to try.
            int[] skipCounts = {0, 30, 50, 100, 300, 500, 1000};
            // Values of m to try.
            int[] smoothingWeights = {10, 100, 500, 1000, 3000};
            File testDir = new File("test_for_chem");

            for (int i = 0; i < smoothingWeights.length; i++) {
                m = smoothingWeights[i];
                LearnNaiveBayesText(chem, 0);
                LearnNaiveBayesText(cs, 1);
                System.out.println("\nm=" + smoothingWeights[i]);

                for (int k = 0; k < skipCounts.length; k++) {
                    N = skipCounts[k];
                    // listFiles() returns null on an I/O error even for a directory.
                    File[] files = testDir.isDirectory() ? testDir.listFiles() : null;
                    if (files == null) {
                        continue;
                    }
                    int chemistry = 0;
                    int computer = 0;
                    for (int s = 0; s < files.length; s++) {
                        if (ClassifyNaiveBayesText(files[s].getAbsoluteFile())) {
                            chemistry++;
                        } else {
                            computer++;
                        }
                    }
                    System.out.println("N=" + skipCounts[k]);
                    System.out.println("chem files=" + chemistry + "\tcs files=" + computer);
                    System.out.print("chem rate=" + (double) chemistry / files.length);
                    System.out.print("\tcs rate=" + (double) computer / files.length + "\n");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Prints the totals of one category and up to 20 of its most frequent words. */
    private static void printCategorySummary(String heading, Vector vt, int kind) {
        System.out.println(heading);
        System.out.println("TotalWords=" + totalWords[kind]);
        System.out.println("TotalFiles=" + totalFiles[kind]);
        System.out.println("\tTop Frequency 20");
        // A small corpus may hold fewer than 20 distinct words.
        int shown = Math.min(20, vt.size());
        for (int j = 0; j < shown; j++) {
            keyAndCount entry = (keyAndCount) vt.elementAt(j);
            System.out.print(entry.key + ",");
        }
    }

    /** Copies {@link #ENGLISH_STOP_WORDS} into {@link #stopwords}. */
    public static void InitStopwords() {
        for (int k = 0; k < ENGLISH_STOP_WORDS.length; k++) {
            stopwords.add(ENGLISH_STOP_WORDS[k]);
        }
    }

    /**
     * Walks {@code file} recursively and feeds every readable regular file
     * to {@link #LoadfileAndStat}.
     *
     * @param file file or directory to visit; unreadable entries are skipped
     * @param vt   vocabulary vector of the category being loaded
     * @param kind category index (0 or 1) into the statistics arrays
     */
    public static void Recursion(File file, Vector vt, int kind) {
        if (!file.canRead()) {
            return;
        }
        if (file.isFile()) {
            System.out.println("add..." + file.toString());
            totalFiles[kind]++;
            LoadfileAndStat(file, vt, kind);
            return;
        }
        File[] children = file.listFiles();
        if (children == null) {
            // Not a directory, or the listing failed: nothing to descend into.
            return;
        }
        for (int i = 0; i < children.length; i++) {
            Recursion(children[i].getAbsoluteFile(), vt, kind);
        }
    }

    /**
     * Reads one HTML file line by line, strips the tags, tokenizes the text
     * and adds every accepted word to the category vocabulary.
     *
     * <p>Relies on {@link #ht} mapping each word already in {@code vt} to its
     * index. A failure while reading or analyzing is reported and ends the
     * processing of this file only; the reader is always closed.
     *
     * @param file file to read
     * @param vt   vocabulary vector to update
     * @param kind category index (0 or 1) whose word total is incremented
     */
    public static void LoadfileAndStat(File file, Vector vt, int kind) {
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(file));
            String line;
            // readLine() returns null at end of file; test it before trim().
            while ((line = in.readLine()) != null) {
                String text = HtmlFilter.TagRemove(line.trim());
                if (text.equals("")) {
                    continue;
                }
                Vector tokens = new Analyze(text).analyze();
                for (int i = 0; i < tokens.size(); i++) {
                    String word = acceptedWord((object) tokens.elementAt(i));
                    if (word == null) {
                        continue;
                    }
                    totalWords[kind]++;
                    Integer position = (Integer) ht.get(word);
                    if (position != null) {
                        keyAndCount known = (keyAndCount) vt.elementAt(position.intValue());
                        known.count++;
                    } else {
                        keyAndCount fresh = new keyAndCount();
                        fresh.key = word;
                        fresh.count = 1;
                        vt.add(fresh);
                        ht.put(word, new Integer(vt.size() - 1));
                    }
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Computes the prior of one category and the smoothed probability of
     * every word in its vocabulary:
     * {@code (count + m/|V|) / (totalWords + m)}, where {@code |V|} is the
     * vocabulary size.
     *
     * @param vt   vocabulary vector whose entries receive their frequency
     * @param kind category index (0 or 1)
     */
    public static void LearnNaiveBayesText(Vector vt, int kind) {
        P[kind] = (double) totalFiles[kind] / (totalFiles[0] + totalFiles[1]);
        int vocabularySize = vt.size();
        for (int i = 0; i < vocabularySize; i++) {
            keyAndCount entry = (keyAndCount) vt.elementAt(i);
            // Floating-point division: m / |V| in int arithmetic truncates
            // (usually to 0) and would disagree with the unseen-word formula
            // used by ClassifyNaiveBayesText.
            entry.frequency =
                (entry.count + (double) m / vocabularySize) / (totalWords[kind] + m);
        }
    }

    /**
     * Classifies one file by comparing the log-probability scores of the two
     * categories.
     *
     * <p>Words found in a category's lookup table contribute the log of their
     * learned frequency; all other accepted words contribute the log of the
     * unseen-word probability {@code (m/|V|) / (totalWords + m)}. The lookup
     * tables skip the first {@link #N} entries of each vocabulary. If the
     * file cannot be read or analyzed, the problem is reported and the score
     * accumulated so far is used.
     *
     * @param fileToClassify file to classify
     * @return {@code true} when the chemistry score is strictly greater than
     *         the computer-science score, {@code false} otherwise
     */
    public static boolean ClassifyNaiveBayesText(File fileToClassify) {
        double[] Vnb = {Math.log(P[0]), Math.log(P[1])};
        double unseenChem = ((double) m / chem.size()) / (totalWords[0] + m);
        double unseenCs = ((double) m / cs.size()) / (totalWords[1] + m);
        Hashtable chemIndex = indexFrom(chem, N);
        Hashtable csIndex = indexFrom(cs, N);

        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(fileToClassify));
            String line;
            while ((line = in.readLine()) != null) {
                // Trim every line, exactly as the training pass does.
                String text = HtmlFilter.TagRemove(line.trim());
                if (text.equals("")) {
                    continue;
                }
                Vector tokens = new Analyze(text).analyze();
                for (int i = 0; i < tokens.size(); i++) {
                    String word = acceptedWord((object) tokens.elementAt(i));
                    if (word == null) {
                        continue;
                    }
                    Vnb[0] += Math.log(wordFrequency(chem, chemIndex, word, unseenChem));
                    Vnb[1] += Math.log(wordFrequency(cs, csIndex, word, unseenCs));
                }
            }
        } catch (Exception ex) {
            // Best effort: keep the partial score, but do not hide the failure.
            System.err.println("Could not fully classify " + fileToClassify + ": " + ex);
        } finally {
            closeQuietly(in);
        }
        return Vnb[0] > Vnb[1];
    }

    /**
     * Sorts {@code vt[left..right]} in place by descending
     * {@code keyAndCount.count}. The sort is not stable. Ranges with fewer
     * than two elements (including an empty vector) are left untouched.
     *
     * @param vt    vector of keyAndCount entries
     * @param left  first index of the range, inclusive
     * @param right last index of the range, inclusive
     */
    public static void QuickSort(Vector vt, int left, int right) {
        if (left >= right) {
            return;
        }
        int i = left;
        int j = right;
        keyAndCount pivot = (keyAndCount) vt.elementAt(left + (right - left) / 2);
        do {
            while (((keyAndCount) vt.elementAt(i)).count > pivot.count && i < right) {
                i++;
            }
            while (((keyAndCount) vt.elementAt(j)).count < pivot.count && j > left) {
                j--;
            }
            if (i <= j) {
                Object swapped = vt.elementAt(i);
                vt.setElementAt(vt.elementAt(j), i);
                vt.setElementAt(swapped, j);
                i++;
                j--;
            }
        } while (i <= j);
        if (left < j) {
            QuickSort(vt, left, j);
        }
        if (right > i) {
            QuickSort(vt, i, right);
        }
    }

    /**
     * Returns the lower-cased text of a token that should be counted, or
     * {@code null} when the token is rejected: one character or shorter, a
     * stop word, or a token whose kind is not 1.
     */
    private static String acceptedWord(object token) {
        if (token.string.length() <= 1) {
            return null;
        }
        String word = token.string.toLowerCase();
        if (stopwords.contains(word) || token.kind != 1) {
            return null;
        }
        return word;
    }

    /** Builds a word -> Integer index table for the entries of vt from position {@code from} on. */
    private static Hashtable indexFrom(Vector vt, int from) {
        Hashtable index = new Hashtable();
        for (int i = from; i < vt.size(); i++) {
            keyAndCount entry = (keyAndCount) vt.elementAt(i);
            index.put(entry.key, new Integer(i));
        }
        return index;
    }

    /** Returns the learned frequency of {@code word}, or {@code unseen} when the index does not contain it. */
    private static double wordFrequency(Vector vt, Hashtable index, String word, double unseen) {
        Integer position = (Integer) index.get(word);
        if (position == null) {
            return unseen;
        }
        return ((keyAndCount) vt.elementAt(position.intValue())).frequency;
    }

    /** Closes the reader if it was opened; a failure to close is ignored. */
    private static void closeQuietly(Reader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if closing a read-only stream fails.
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -