// matric.java  (file name and "font size:" label captured from the web code viewer; not part of the program)
package src.paper;
import java.io.*;
import java.util.*;
import org.apache.lucene.index.*;
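/* Output file formats */
/*
*.mat.rlabel : row labels; each line holds the identifier of one document, i.e. doc[row].docno
*.mat.clabel : column labels; the vocabulary of the document collection. Each line is one dimension of the matrix, i.e. every term that occurs in the collection is one dimension.
*.mat : the matrix itself, laid out as follows:
Line 1: number of documents (doc), number of distinct terms (term) in the collection, number of non-zero entries (item) in the matrix
Line 2: representation of doc[2-1] (a set of pairs; each pair is a column index and its value, separated by spaces)
...
Line n: representation of doc[n-1]
*/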
/**
 * Exports the "content" field of a Lucene index as a sparse document-term matrix.
 *
 * <p>Three files are written next to the index directory, all sharing the prefix
 * {@code <output>_<option>}:
 * <ul>
 *   <li>{@code .mat.clabel} - one term of the "content" field per line; line k is column k
 *       (columns are numbered from 1).</li>
 *   <li>{@code .mat.rlabel} - one line per exported document, holding its stored "path" field.</li>
 *   <li>{@code .mat} - a header line {@code "<rows> <columns> <non-zero entries>"} followed by
 *       one line per exported document made of {@code "<column> <frequency> "} pairs.</li>
 * </ul>
 *
 * <p>Documents that are deleted, or that have no term vector stored for "content", are left out.
 */
public class Matric
{
    /** Name of the indexed field whose terms form the matrix columns. */
    private static final String CONTENT_FIELD = "content";

    /** Encoding used for every output file, so that labels and matrix agree on any platform. */
    private static final String OUTPUT_CHARSET = "GBK";

    public IndexReader reader = null;
    public String indexPath = null;

    /**
     * Opens the index at {@code indexPath}.
     *
     * <p>A failure to open is reported on {@code System.err} and leaves both {@link #reader} and
     * {@link #indexPath} as {@code null}; {@link #FillRowAndMat} then refuses to run.
     *
     * @param indexPath file-system path of the Lucene index directory
     */
    public Matric(String indexPath)
    {
        try
        {
            reader = IndexReader.open(indexPath);
            this.indexPath = indexPath;
        }
        catch (Exception e)
        {
            e.printStackTrace(System.err);
        }
    }

    /**
     * Builds the vocabulary of the "content" field and writes it to
     * {@code matricPath + output + "_" + option + ".mat.clabel"}, one term per line.
     *
     * <p>Any failure is reported on {@code System.err}; the entries collected up to that point
     * are still returned.
     *
     * @param output     first part of the output file name
     * @param matricPath directory prefix (including its trailing separator) for the output file
     * @param option     second part of the output file name
     * @return map from term text to its 1-based column number
     */
    public HashMap<String, Integer> FillColumn(String output, String matricPath, String option)
    {
        HashMap<String, Integer> column = new HashMap<String, Integer>();
        TermEnum termEnum = null;
        BufferedWriter clabelWriter = null;
        try
        {
            termEnum = reader.terms();
            clabelWriter = openWriter(matricPath + output + "_" + option + ".mat.clabel");
            int num = 1;
            while (termEnum.next())
            {
                Term term = termEnum.term();
                // equals() instead of ==: identity comparison only works while both strings
                // happen to be interned.
                if (CONTENT_FIELD.equals(term.field()))
                {
                    column.put(term.text(), Integer.valueOf(num++));
                    clabelWriter.write(term.text() + "\n");
                }
            }
        }
        catch (Exception e)
        {
            e.printStackTrace(System.err);
        }
        finally
        {
            closeAndReport(clabelWriter);
            if (termEnum != null)
            {
                try
                {
                    termEnum.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace(System.err);
                }
            }
        }
        return column;
    }

    /**
     * Writes the vocabulary, the row labels and the matrix for the opened index.
     *
     * <p>The index is scanned twice: once to compute the header counts, once to write the rows.
     * Only documents accepted by the first scan are written by the second, so the header and the
     * body agree unless the index becomes unreadable between the two scans (in which case the
     * affected document is skipped in both the matrix and the row-label file and a warning is
     * printed). A failure while writing stops the export and is reported on {@code System.err}.
     *
     * @param output first part of the output file names
     * @param option second part of the output file names
     * @return the directory prefix (everything up to and including the last path separator of
     *         the index path) under which the files were written
     * @throws IllegalStateException if the index could not be opened by the constructor, or if a
     *         document contains a term that is missing from the vocabulary
     */
    public String FillRowAndMat(String output, String option)
    {
        if (reader == null || indexPath == null)
        {
            throw new IllegalStateException("The index was not opened; nothing can be exported");
        }

        String matricPath = parentPrefix(indexPath);
        HashMap<String, Integer> termNo = FillColumn(output, matricPath, option);
        int termsnum = termNo.size(); // column number

        // Document ids run up to maxDoc() and may contain deleted slots; numDocs() is only the
        // count of live documents and must not be used as the loop bound.
        int maxDoc = reader.maxDoc();
        BitSet exported = new BitSet(maxDoc);
        int itemsnum = 0; // non-zero entries
        int nullNum = 0;

        // Pass 1: decide which documents become rows and count the non-zero entries.
        for (int i = 0; i < maxDoc; i++)
        {
            if (reader.isDeleted(i))
            {
                continue;
            }
            TermFreqVector vector = null;
            try
            {
                vector = reader.getTermFreqVector(i, CONTENT_FIELD);
            }
            catch (IOException e)
            {
                // Treated like a missing vector so that a stale value is never reused.
                e.printStackTrace();
            }
            if (vector == null)
            {
                System.out.println((++nullNum) + ": null");
                continue;
            }
            exported.set(i);
            itemsnum = itemsnum + vector.size();
        }

        int rowCount = exported.cardinality(); // row number
        String firstLine = rowCount + " " + termsnum + " " + itemsnum + "\n";
        String filePrefix = matricPath + output + "_" + option;

        BufferedWriter matricWriter = null;
        BufferedWriter rlabelWriter = null;
        try
        {
            matricWriter = openWriter(filePrefix + ".mat");
            rlabelWriter = openWriter(filePrefix + ".mat.rlabel");
            matricWriter.write(firstLine);
            System.out.println(firstLine);

            // Pass 2: write one label line and one matrix line per accepted document.
            for (int i = exported.nextSetBit(0); i >= 0; i = exported.nextSetBit(i + 1))
            {
                TermFreqVector vector;
                String docno;
                try
                {
                    vector = reader.getTermFreqVector(i, CONTENT_FIELD);
                    docno = reader.document(i).get("path");
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                    System.err.println("Skipping document " + i
                            + "; the header of the .mat file now overstates the rows written");
                    continue;
                }
                if (vector == null)
                {
                    System.err.println("Skipping document " + i
                            + "; its term vector disappeared between the two scans");
                    continue;
                }
                // Both lines are produced together so labels and rows stay aligned.
                String row = formatRow(i, vector, termNo);
                rlabelWriter.write(docno + "\n");
                matricWriter.write(row);
            }
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            closeAndReport(matricWriter);
            closeAndReport(rlabelWriter);
        }

        System.out.println("\nnull docNum=" + rowCount + "\ttermNum=" + termsnum + "\titemNum=" + itemsnum);
        return matricPath;
    }

    /** Renders one document as "column frequency " pairs followed by a newline. */
    private static String formatRow(int docId, TermFreqVector vector, Map<String, Integer> termNo)
    {
        String[] terms = vector.getTerms();
        int[] termFreq = vector.getTermFrequencies();
        int size = vector.size();
        StringBuilder matricLine = new StringBuilder();
        for (int j = 0; j < size; j++)
        {
            Integer columnNo = termNo.get(terms[j]);
            if (columnNo == null)
            {
                throw new IllegalStateException("Term '" + terms[j] + "' of document " + docId
                        + " is missing from the vocabulary; the column labels are incomplete");
            }
            matricLine.append(columnNo.toString());
            matricLine.append(" ");
            matricLine.append(termFreq[j]);
            matricLine.append(" ");
        }
        matricLine.append("\n");
        return matricLine.toString();
    }

    /** Returns {@code path} up to and including its last '\' or '/', or "" if it has neither. */
    private static String parentPrefix(String path)
    {
        int cut = Math.max(path.lastIndexOf('\\'), path.lastIndexOf('/'));
        return path.substring(0, cut + 1);
    }

    /** Opens {@code path} for writing in the shared output encoding. */
    private static BufferedWriter openWriter(String path) throws IOException
    {
        FileOutputStream stream = new FileOutputStream(path);
        try
        {
            return new BufferedWriter(new OutputStreamWriter(stream, OUTPUT_CHARSET));
        }
        catch (UnsupportedEncodingException e)
        {
            stream.close(); // do not leak the file handle when the charset is unavailable
            throw e;
        }
    }

    /** Closes {@code resource} if it was opened, reporting (not hiding) a failed flush/close. */
    private static void closeAndReport(Closeable resource)
    {
        if (resource == null)
        {
            return;
        }
        try
        {
            resource.close();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    public static void main(String[] args)
    {
        String indexPath = "F:\\navy\\Project\\论文\\2006News_index";
        String matricPath = new Matric(indexPath).FillRowAndMat("", "doc");
        System.out.println(matricPath);
    }
}
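/*
 * Keyboard shortcuts (help text captured from the web code viewer; not part of the program):
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Toggle theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */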