📄 textfreq.java
字号:
package textfreqnew;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
//import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
//import java.io.PrintWriter;
import java.util.*;
//import java.lang.*;
import java.lang.Math;
//import textfreq.CountedSet;
//import java.util.StringTokenizer;
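/*
 * Intended pipeline: use the word base to compute the term frequency of each text, then
 * compute the document frequency of every word in the word base, then derive each word's
 * weight in each document from the term frequency and document frequency, and store the
 * weights in a designated text file.
 *
 * Only the term-frequency step is currently active; the document-frequency and weighting
 * steps are commented out in TextFreq.
 */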
/**
 * Mutable occurrence counter used as a map value.
 *
 * <p>A new instance already stands for one occurrence; callers bump the
 * package-visible field {@link #i} directly for each further occurrence.
 */
class Counter {
    /** Current count; a freshly created counter starts at 1. */
    int i = 1;

    /**
     * Renders the current count as a decimal string.
     *
     * @return the decimal representation of {@link #i}
     */
    @Override
    public String toString() {
        return String.valueOf(i);
    }
}
public class TextFreq{
private List reads=new ArrayList();
public static Map hm1=new HashMap();//存储词和词频
public static Map hm2=new HashMap();//存储词库
public static Map hm3=new HashMap();//存储文档频率
//public static int i=0;
public static int N=1560;
public Double d1;
public Double d2;
public Double W;
//public static Double log(Double N/d2);
/**
* @param args
* @throws FileNotFoundException
*/
public static void main(String[] args) throws FileNotFoundException {
// TODO Auto-generated method stub
String filedest="E:\\experiment\\trainstopwordnew1";
TextFreq f=new TextFreq();
File file=new File(filedest);
//读取filedest中的所有文件 并存贮在reads中
f.read(file.listFiles());
if(f.getReads()!=null)
{
f.read(f.getReads());
}
}
/**
* @param files
* 读取文件 但还没有读取文件的内容
*/
public void read(File []files){
for(int i=0;i<files.length;i++)
{
if(files[i].isDirectory())
{
this.read(files[i].listFiles());
}else
{
this.reads.add(files[i]);
}
}
}
//读取文件内容
public void read(List files) throws FileNotFoundException
{
//int i=0;
//CountedSet cs=new CountedSet();
Iterator iterator=files.iterator();
File file=null;
while(iterator.hasNext())
{
file=(File) iterator.next();
System.out.println("读取文件"+file.getName()+"内容");
LineNumberReader reader;
//System.out.println("avsdadfa");
try
{
reader = new LineNumberReader(new BufferedReader(new InputStreamReader(new FileInputStream(file))));
// BufferedReader inputStream=new BufferedReader(new FileReader("E:\\experiment\\wordbase2.txt"));
//System.out.println("avsdadfa");
String line1;
//String line2;
PrintWriter fos=new PrintWriter(new File("E:\\experiment\\textfreq\\"+file.getName()));
//FileOutputStream fos=new FileOutputStream(new File("E:\\test1\\textfreq\\"+file.getName()));
//BufferedOutputStream fos2=new BufferedOutputStream(new FileOutputStream("E:\\test1\\wordfreq.txt"));
// BufferedOutputStream fos3=new BufferedOutputStream(new FileOutputStream("E:\\test1\\wordwight.txt"));
//统计每个文本中的词频,并把词和相应的词频存放到hm1中
while((line1=reader.readLine())!=null){
if(hm1.containsKey(line1))
((Counter)hm1.get(line1)).i++;
else
hm1.put(line1, new Counter());
}
// 输出hm1,测试是否成功存储到hm1中
//System.out.println(hm1);
// 把词频写入到文本中
//Iterator iter=cs.getMap().keySet().iterator();
// Iterator iter=cs.getMap().values().iterator();
Set keySet=hm1.keySet();
String text;
int count;
for(Iterator iter=hm1.keySet().iterator();iter.hasNext();){
text=(String)iter.next();
count=((Counter)hm1.get(text)).i;
//count=(hm1.get(text)).
fos.write(text+":"+count+"\r\n");
}
fos.close();
hm1.clear();
//把词库存储到hm2中
/* while((line2=inputStream.readLine())!=null){
if(!(hm2.containsKey(line2)))
hm2.put(line2, new Counter());
else
((Counter)hm2.get(line2)).i++;
}
*/
//统计文档频率,把hm1中的词与wordbase(hm2)中的词进行比较,如果hm1中有wordbase中的词,则加1
/* Iterator iter2=hm2.keySet().iterator();
Iterator iter1=hm1.keySet().iterator();
while(iter2.hasNext()){
String s2=iter2.next().toString();
if(hm1.containsKey(s2)){
if(!(hm3.containsKey(s2))){
hm3.put(s2, new Counter());
}
else{
((Counter)hm3.get(s2)).i++;
}
}
}*/
//输出hm3,测试文档频率
//System.out.println(hm3);
//计算权重,W=tf*idf,其中tf=cs.getMap().values(),df=hm3.values(),idf=log(N/df)
/*Iterator iter31=hm3.keySet().iterator();//文档频率
Iterator iter11=hm1.keySet().iterator();//词频
while((iter31.hasNext())&(iter11.hasNext())){
String s3=iter31.next().toString();
String s4=iter11.next().toString();
if(s3.equals(s4)){
d1=Double.valueOf(hm1.get(s4).toString());//词s3的词频值
d2=Double.valueOf(hm3.get(s3).toString());//词s3的文档数
Double a=java.lang.Math.log(N/d2);
W=d1*a;
//W=d1*log(N/d2);
//System.out.println("asd:-----------------------");
//System.out.println(W);
String sw=W.toString()+"\r\n";
byte[] bw=sw.getBytes();
fos3.write(bw,0,bw.length);
//算法没有问题,但是把w存储到fos3中时,被复写,正在找原因。。。。。
}
//hm3.clear();
}
fos3.close();*/
//hm1.clear();
//hm2.clear();
//hm3.clear();
//hm3.clear();
//cs.getMap().clear();
}
catch (IOException e)
{
System.out.println("Install tips are not found!");
e.printStackTrace();
}
}
}
public List getReads() {
return reads;
}
public void setReads(List reads) {
this.reads = reads;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -