📄 poiofficeextractor.java

📁 《lucene+nutch搜索引擎开发》源代码
💻 JAVA
字号:
package chapter9;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hwpf.extractor.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.*;
import org.apache.poi.hwpf.usermodel.*;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;

//import LuceneBook.ChineseAnalyzer;

import java.io.FileInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;


public class POIOfficeExtractor {

	public static String xlsfileToBeRead="D:\\workshop\\docs\\books.xls";
	public static String docfileToBeRead="D:\\workshop\\docs\\softwarerequest.doc";
	
	private static String Dest_Index_Path = "D:\\workshop\\index";

	
	/*================================================================
	 * 名 称：DocQueryIndex
	 * 功 能：构造Doc文档检索查询器，对指定的索引进行查询。
	 ===============================================================*/
	public static void DocQueryIndex(){
		
		try {
			IndexSearcher searcher = new IndexSearcher(Dest_Index_Path); // 生成检索器对象
						
			Term term = new Term("名称","doc");                      // 检索关键字
			Query query = new TermQuery(term);                           // 生成检索对象
			System.out.println("----------检索内容："+query.toString()+"----------");	

			Hits hits = searcher.search(query);                          // 提交检索
			
			System.out.println("----------检索结果: 共检索到 "+hits.length()+" 条 ----------");		
			
			for(int i=0; i < hits.length(); i++)                         // 获得结果
			{
				System.out.println(hits.doc(i));
				System.out.println(hits.doc(i).getField("id"));			
			}
			
		}catch (IOException e) {
				e.printStackTrace();
		}

		System.out.println("----------索引检索：PDF索引查询成功----------");		
		
	}	
	/*================================================================
	 * 名 称：DocIndexBuilder
	 * 功 能：构造Doc磁盘索引，添加内容到指定目录，为后续检索查询做好准备。
	 ===============================================================*/
	public static void DocIndexBuilder(){
		
		try {
			
			//Analyzer TextAnalyzer = new StandardAnalyzer();                   // 生成分析器
			Analyzer TextAnalyzer = new ChineseAnalyzer();
			
			IndexWriter TextIndex = new IndexWriter(Dest_Index_Path,TextAnalyzer,true); // 生成索引器
	        TextIndex.setUseCompoundFile(true);
				
			Document document = new Document() ;                              // 由Office文件生成文档对象
			
		    FileInputStream in = new FileInputStream(new File(docfileToBeRead));
		     
		    HWPFDocument doc = new HWPFDocument(in);
		    Range range = doc.getRange();
		    String text = range.text(); 
		    
			Field field_doc = new Field("doc", text, 
					Field.Store.YES,Field.Index.TOKENIZED);
			document.add(field_doc);
			System.out.println("----------创建索引：Office 文件内容  ----------");		

			//System.out.println(document);
			TextIndex.addDocument(document);                                // 添加文档到索引

			TextIndex.optimize();
			TextIndex.close();                                              // 索引完毕
		
		}catch (IOException e) {
				e.printStackTrace();
		}

		System.out.println("----------创建索引：Office 文件成功. ----------");		
	}
	
	
	public static void GetWordDetail(String filename) throws Exception
	{
	     FileInputStream in = new FileInputStream(new File(filename));
	     
	     HWPFDocument doc = new HWPFDocument(in);
	     Range range = doc.getRange();
	     String text = range.text(); 

		for(int i=0;i < range.numParagraphs();i++ ){ 
			Paragraph p = range.getParagraph(i);   //取得每个段落 
			//组合文字并添加换行 
			text = p.text(); 
			text.trim();
			//text = " <br> " ;
		    System.out.println( text );
		}
	}
	
	// 获取doc内纯文本信息
	public static void GetWordContent(String filename) throws Exception
	{
	     FileInputStream in = new FileInputStream(new File(filename));
	     
	     WordExtractor extractor  = new WordExtractor(in);   // 创建WordExtractor
	     String text = extractor.getText();                  // 对DOC文件进行提取
	     System.out.println( text );
	}

	public static void GetWordHWPFDocument(String filename) throws Exception
	{	
		InputStream in = new FileInputStream(new File( "c:\\test.doc ")); //流入doc文档 
		HWPFDocument wordDocument = new HWPFDocument(in); //通过流得到文档类型 
		Range range = wordDocument.getRange(); //取得文档篇幅 
		int total = range.numParagraphs();     //文档内的总段落数 
		String content = " ";//文章内容 
		for(int i=0;i <total;i++ ){ 
		Paragraph p = range.getParagraph(i);   //取得每个段落 
		//组合文字并添加换行 
		content = p.text(); 
		content = " <br> " ;
		}
	} 

	// 获取Excel内纯文本信息
	public static void GetExcelContent(String filename) throws Exception
	{
		// 创建对指定Excel工作文件的引用
		HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(filename));
		HSSFSheet sheet = workbook.getSheetAt(0); 	             // 创建对工作表的引用。

		for( int i =0 ; i < workbook.getNumberOfSheets() ; i++ ) // 循环取表单对象
		{
			System.out.print("########## sheet:--" + i + " --########## " );
			sheet = workbook.getSheetAt(i);                      // 查阅文档的Sheet属性
			if( sheet != null )
			{
				for(int m = 0; m < sheet.getLastRowNum(); m++ )  //  按行循环取行对象
				{
					HSSFRow row = sheet.getRow(m);
					if( row == null){ break;}
					System.out.println("");
					if(row.getLastCellNum() <= 0) break;
					System.out.println(  "-----line:--" + m + " ---- ,col num:" 
							           + row.getLastCellNum());
					for(int n = 0; n < row.getLastCellNum(); n++) // 按列循环取单元格对象
					{
						HSSFCell cell = row.getCell((short)n);

						if( cell == null){	break; }
						int type = cell.getCellType();
						switch(type)
						{     case 0:
									System.out.print( cell.getNumericCellValue() + " , "); 
									break;
						      case 1:
									System.out.print(cell.getStringCellValue() + " , "); 
									break;
						      case 2:
									break;
						      case 3:
									System.out.print( " , "); 
									break;
							  default:
								  System.out.print("未知的单元类型" + type+" , ");
						}
					}
				}
			}
			System.out.println();			
		}
	}
	
	public static void main(String argv[]){ 
	try{

		//GetExcelContent(xlsfileToBeRead);
		//GetWordContent(docfileToBeRead);
		GetWordDetail(docfileToBeRead);
		
		//DocIndexBuilder();
		//DocQueryIndex();
	   }catch(Exception e) {
	   System.out.println("运行错误 : " + e );
  }
 }
	
	
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -