📄 poiofficeextractor.java
字号:
package chapter9;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hwpf.extractor.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.*;
import org.apache.poi.hwpf.usermodel.*;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;
//import LuceneBook.ChineseAnalyzer;
import java.io.FileInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
public class POIOfficeExtractor {
public static String xlsfileToBeRead="D:\\workshop\\docs\\books.xls";
public static String docfileToBeRead="D:\\workshop\\docs\\softwarerequest.doc";
private static String Dest_Index_Path = "D:\\workshop\\index";
/*================================================================
* 名 称:DocQueryIndex
* 功 能:构造Doc文档检索查询器,对指定的索引进行查询。
===============================================================*/
public static void DocQueryIndex(){
try {
IndexSearcher searcher = new IndexSearcher(Dest_Index_Path); // 生成检索器对象
Term term = new Term("名称","doc"); // 检索关键字
Query query = new TermQuery(term); // 生成检索对象
System.out.println("----------检索内容:"+query.toString()+"----------");
Hits hits = searcher.search(query); // 提交检索
System.out.println("----------检索结果: 共检索到 "+hits.length()+" 条 ----------");
for(int i=0; i < hits.length(); i++) // 获得结果
{
System.out.println(hits.doc(i));
System.out.println(hits.doc(i).getField("id"));
}
}catch (IOException e) {
e.printStackTrace();
}
System.out.println("----------索引检索:PDF索引查询成功----------");
}
/*================================================================
* 名 称:DocIndexBuilder
* 功 能:构造Doc磁盘索引,添加内容到指定目录,为后续检索查询做好准备。
===============================================================*/
public static void DocIndexBuilder(){
try {
//Analyzer TextAnalyzer = new StandardAnalyzer(); // 生成分析器
Analyzer TextAnalyzer = new ChineseAnalyzer();
IndexWriter TextIndex = new IndexWriter(Dest_Index_Path,TextAnalyzer,true); // 生成索引器
TextIndex.setUseCompoundFile(true);
Document document = new Document() ; // 由Office文件生成文档对象
FileInputStream in = new FileInputStream(new File(docfileToBeRead));
HWPFDocument doc = new HWPFDocument(in);
Range range = doc.getRange();
String text = range.text();
Field field_doc = new Field("doc", text,
Field.Store.YES,Field.Index.TOKENIZED);
document.add(field_doc);
System.out.println("----------创建索引:Office 文件内容 ----------");
//System.out.println(document);
TextIndex.addDocument(document); // 添加文档到索引
TextIndex.optimize();
TextIndex.close(); // 索引完毕
}catch (IOException e) {
e.printStackTrace();
}
System.out.println("----------创建索引:Office 文件成功. ----------");
}
public static void GetWordDetail(String filename) throws Exception
{
FileInputStream in = new FileInputStream(new File(filename));
HWPFDocument doc = new HWPFDocument(in);
Range range = doc.getRange();
String text = range.text();
for(int i=0;i < range.numParagraphs();i++ ){
Paragraph p = range.getParagraph(i); //取得每个段落
//组合文字并添加换行
text = p.text();
text.trim();
//text = " <br> " ;
System.out.println( text );
}
}
// 获取doc内纯文本信息
public static void GetWordContent(String filename) throws Exception
{
FileInputStream in = new FileInputStream(new File(filename));
WordExtractor extractor = new WordExtractor(in); // 创建WordExtractor
String text = extractor.getText(); // 对DOC文件进行提取
System.out.println( text );
}
public static void GetWordHWPFDocument(String filename) throws Exception
{
InputStream in = new FileInputStream(new File( "c:\\test.doc ")); //流入doc文档
HWPFDocument wordDocument = new HWPFDocument(in); //通过流得到文档类型
Range range = wordDocument.getRange(); //取得文档篇幅
int total = range.numParagraphs(); //文档内的总段落数
String content = " ";//文章内容
for(int i=0;i <total;i++ ){
Paragraph p = range.getParagraph(i); //取得每个段落
//组合文字并添加换行
content = p.text();
content = " <br> " ;
}
}
// 获取Excel内纯文本信息
public static void GetExcelContent(String filename) throws Exception
{
// 创建对指定Excel工作文件的引用
HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(filename));
HSSFSheet sheet = workbook.getSheetAt(0); // 创建对工作表的引用。
for( int i =0 ; i < workbook.getNumberOfSheets() ; i++ ) // 循环取表单对象
{
System.out.print("########## sheet:--" + i + " --########## " );
sheet = workbook.getSheetAt(i); // 查阅文档的Sheet属性
if( sheet != null )
{
for(int m = 0; m < sheet.getLastRowNum(); m++ ) // 按行循环取行对象
{
HSSFRow row = sheet.getRow(m);
if( row == null){ break;}
System.out.println("");
if(row.getLastCellNum() <= 0) break;
System.out.println( "-----line:--" + m + " ---- ,col num:"
+ row.getLastCellNum());
for(int n = 0; n < row.getLastCellNum(); n++) // 按列循环取单元格对象
{
HSSFCell cell = row.getCell((short)n);
if( cell == null){ break; }
int type = cell.getCellType();
switch(type)
{ case 0:
System.out.print( cell.getNumericCellValue() + " , ");
break;
case 1:
System.out.print(cell.getStringCellValue() + " , ");
break;
case 2:
break;
case 3:
System.out.print( " , ");
break;
default:
System.out.print("未知的单元类型" + type+" , ");
}
}
}
}
System.out.println();
}
}
public static void main(String argv[]){
try{
//GetExcelContent(xlsfileToBeRead);
//GetWordContent(docfileToBeRead);
GetWordDetail(docfileToBeRead);
//DocIndexBuilder();
//DocQueryIndex();
}catch(Exception e) {
System.out.println("运行错误 : " + e );
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -