⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 matric.java

📁 用于文本的聚类
💻 JAVA
字号:
package src.paper;

import java.io.*;
import java.util.*;

import org.apache.lucene.index.*;

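/*	Output file format	*/
/*
	*.mat.rlabel : row labels; each line holds the identifier of one document, i.e. doc[row].docno
	*.mat.clabel : column labels; the vocabulary of the document collection. Each line is one dimension of the matrix, i.e. every term that occurs in the collection is a dimension.

	*.mat : the matrix file, laid out as follows:
	line 1: number of documents (doc)  number of distinct terms (term) in the collection  number of non-zero entries (item) in the matrix

	line 2: representation of doc[2-1] (a list of pairs; each pair is a column number and its value, separated by spaces)
	 ...
	line n: representation of doc[n-1]
*/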

/**
 * Exports the "content" field of a Lucene index as a sparse document-term
 * matrix made of three text files written next to the index directory:
 * <ul>
 *   <li>{@code <output>_<option>.mat.clabel} - one term per line (column labels);</li>
 *   <li>{@code <output>_<option>.mat.rlabel} - one stored "path" value per line (row labels);</li>
 *   <li>{@code <output>_<option>.mat} - a header line followed by one line of
 *       "column frequency" pairs per document.</li>
 * </ul>
 * All three files are written in GBK.
 */
public class Matric
{
	/** Name of the indexed field whose terms form the matrix columns. */
	private static final String CONTENT_FIELD = "content";

	/** Charset shared by every output file so that labels and matrix agree. */
	private static final String OUTPUT_CHARSET = "GBK";

	public IndexReader reader = null;
	public String indexPath = null;

	/**
	 * Opens the index stored at {@code indexPath}.
	 * A failure is reported on {@code System.err} and leaves both
	 * {@link #reader} and {@link #indexPath} set to {@code null}.
	 *
	 * @param indexPath path of the Lucene index directory
	 */
	public Matric(String indexPath){
		try
		{
			// open the index
			reader = IndexReader.open(indexPath);
			this.indexPath = indexPath;
		}
		catch (Exception e)
		{
			e.printStackTrace(System.err);
		}
	}

	/**
	 * Builds the vocabulary of the "content" field and writes it, one term per
	 * line, to {@code matricPath + output + "_" + option + ".mat.clabel"}.
	 * Terms are numbered from 1 in enumeration order; line k of the file holds
	 * the term numbered k.
	 *
	 * @param output     file name prefix
	 * @param matricPath directory prefix (including the trailing separator) for the output file
	 * @param option     file name suffix placed after an underscore
	 * @return map from term text to its 1-based column number; on error the
	 *         exception is printed and the entries collected so far are returned
	 */
	public HashMap<String, Integer> FillColumn(String output,String matricPath,String option)
	{
		HashMap<String,Integer> column = new HashMap<String,Integer>();
		TermEnum termEnum = null;
		Writer clabelWriter = null;

		try
		{
			termEnum = reader.terms();
			clabelWriter = openWriter(matricPath + output + "_" + option + ".mat.clabel");
			int num = 1;
			while (termEnum.next())
			{
				Term term = termEnum.term();
				// equals(), not ==: reference equality only works while field names happen to be interned
				if (CONTENT_FIELD.equals(term.field()))
				{
					column.put(term.text(), Integer.valueOf(num++));
					clabelWriter.write(term.text() + "\n");
				}
			}
		}
		catch (Exception e)
		{
			e.printStackTrace(System.err);
		}
		finally
		{
			// release both resources even when enumeration or writing failed
			if (termEnum != null)
			{
				try
				{
					termEnum.close();
				}
				catch (IOException e)
				{
					e.printStackTrace(System.err);
				}
			}
			close(clabelWriter);
		}

		return column;
	}

	/**
	 * Writes the row-label file and the matrix file for every live document
	 * that has a term vector on the "content" field, after first writing the
	 * column-label file through {@link #FillColumn}.
	 * <p>
	 * The index is scanned twice: once to count rows and non-zero entries for
	 * the header line, once to emit the rows. Documents that are deleted, have
	 * no term vector, or cannot be read are skipped in both passes.
	 *
	 * @param output file name prefix
	 * @param option file name suffix placed after an underscore
	 * @return the directory prefix (everything up to and including the last
	 *         path separator of the index path) where the files are written
	 * @throws IllegalStateException if the index could not be opened by the
	 *         constructor, or a document contains a term absent from the vocabulary
	 */
	public String FillRowAndMat(String output,String option)
	{
		if (reader == null || indexPath == null)
		{
			throw new IllegalStateException("index was not opened; cannot build the matrix");
		}

		// accept both Windows and Unix separators
		int lastSeparator = Math.max(indexPath.lastIndexOf('\\'), indexPath.lastIndexOf('/'));
		String matricPath = indexPath.substring(0, lastSeparator + 1);

		HashMap<String,Integer> termNo = FillColumn(output,matricPath,option);
		// document ids run up to maxDoc(); numDocs() excludes deleted documents
		// and is therefore not a valid upper bound for ids
		int maxDoc = reader.maxDoc();
		int termsnum = termNo.size();//column number
		int rowsnum = 0;//row number
		int itemsnum = 0;//non zeros entries
		int nullNum = 0;

		// first pass: count rows and non-zero entries for the header line
		for ( int i = 0 ; i < maxDoc ; i++ )
		{
			if (reader.isDeleted(i))
			{
				continue;
			}

			TermFreqVector termFreqVector;
			try
			{
				termFreqVector = reader.getTermFreqVector(i, CONTENT_FIELD);
			}
			catch (IOException e)
			{
				// skip the unreadable document instead of reusing the previous vector
				e.printStackTrace();
				continue;
			}

			if (termFreqVector == null)
			{
				System.out.println((++nullNum) + ":  null");
				continue;
			}

			rowsnum++;
			itemsnum = itemsnum + termFreqVector.size();
		}

		String firstLine = rowsnum + " " + termsnum + " " + itemsnum + "\n";
		String basePath = matricPath + output + "_" + option;

		BufferedWriter matricWriter = null;
		BufferedWriter rlabelWriter = null;
		try
		{
			try
			{
				matricWriter = openWriter(basePath + ".mat");
				rlabelWriter = openWriter(basePath + ".mat.rlabel");

				matricWriter.write(firstLine);
				System.out.println(firstLine);
			}
			catch (IOException e)
			{
				// without both files there is nothing useful left to do
				e.printStackTrace();
				return matricPath;
			}

			// second pass: one row label and one matrix line per document
			for ( int i = 0; i < maxDoc ; i++ )
			{
				if (reader.isDeleted(i))
				{
					continue;
				}

				try
				{
					writeRow(i, termNo, matricWriter, rlabelWriter);
				}
				catch (IOException e)
				{
					e.printStackTrace();
				}
			}
		}
		finally
		{
			close(matricWriter);
			close(rlabelWriter);
		}

		// the first figure is the number of rows written (documents that have a term vector)
		System.out.println("\nnull docNum=" + rowsnum + "\ttermNum=" + termsnum + "\titemNum=" + itemsnum);

		return matricPath;
	}

	/**
	 * Emits the row label and the matrix line of one document.
	 * Nothing is written when the document has no "content" term vector.
	 * The matrix line is fully built before either file is touched so that a
	 * failure while reading the document does not leave a label without a row.
	 */
	private void writeRow(int docId, Map<String,Integer> termNo, Writer matricWriter, Writer rlabelWriter)
		throws IOException
	{
		TermFreqVector termFreqVector = reader.getTermFreqVector(docId, CONTENT_FIELD);
		if (termFreqVector == null)
		{
			return;
		}

		String docno = reader.document(docId).get("path");

		String[] terms = termFreqVector.getTerms();
		int[] termFreq = termFreqVector.getTermFrequencies();
		int size = termFreqVector.size();

		// one line per document: "column frequency column frequency ..."
		StringBuilder matricLine = new StringBuilder();
		for (int j = 0; j < size ; j++ )
		{
			Integer columnNo = termNo.get(terms[j]);
			if (columnNo == null)
			{
				throw new IllegalStateException(
					"term '" + terms[j] + "' of document " + docId + " is missing from the vocabulary");
			}
			matricLine.append(columnNo.intValue());
			matricLine.append(" ");
			matricLine.append(termFreq[j]);
			matricLine.append(" ");
		}
		matricLine.append("\n");

		rlabelWriter.write(docno + "\n");
		matricWriter.write(matricLine.toString());
	}

	/** Opens a buffered writer on {@code path} using the shared output charset. */
	private static BufferedWriter openWriter(String path) throws IOException
	{
		FileOutputStream stream = new FileOutputStream(path);
		try
		{
			return new BufferedWriter(new OutputStreamWriter(stream, OUTPUT_CHARSET));
		}
		catch (UnsupportedEncodingException e)
		{
			// do not leak the file handle when the charset is unavailable
			stream.close();
			throw e;
		}
	}

	/** Closes {@code resource} if it is non-null, reporting (not propagating) any failure. */
	private static void close(Closeable resource)
	{
		if (resource == null)
		{
			return;
		}
		try
		{
			resource.close();
		}
		catch (IOException e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Builds the matrix for the index given as the first command-line argument,
	 * or for the built-in default index path when no argument is supplied, and
	 * prints the directory the files were written to.
	 */
	public static void main(String[] args){
		String indexPath = "F:\\navy\\Project\\论文\\2006News_index";
		if (args != null && args.length > 0)
		{
			indexPath = args[0];
		}
		String matricPath = new Matric(indexPath).FillRowAndMat("","doc");
		System.out.println(matricPath);
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -