📄 luceneindexlocaldisk.java
字号:
package Chapter12;
import java.io.IOException;
import java.io.File;
import java.io.FileReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import java.io.*;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;
import org.apache.poi.hwpf.extractor.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.*;
import org.apache.poi.hwpf.usermodel.*;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.TextExtractingVisitor;
/*******************************************************************
* 本代码完成本地指定目录的遍历和文件查找。对指定后缀的文件进行分析,利用Lucene建立
* 索引,为后续检索使用做好准备。
*******************************************************************/
public class LuceneIndexLocalDisk {
// Directory the Lucene index is written to; main passes it to indexBuilder as indexPath.
private static String Dest_Index_Path = "D:\\workshop\\alldata3";
// The commented-out Text_File_Path lines are alternative source directories kept for reference.
//private static String Text_File_Path = "D:\\workshop\\ch12\\012\\";
// Root directory whose files get indexed; main passes it to indexBuilder as localPath.
private static String Text_File_Path = "D:\\科技部项目\\参考文献资料\\";
//private static String Text_File_Path = "C:\\test\\";
/*========================================================
* 主函数,指定索引目录和待分析的目录,生成Lucene索引
*========================================================*/
/**
 * Program entry point.
 *
 * <p>Runs {@code indexBuilder} with the index directory {@code Dest_Index_Path}
 * and the source directory {@code Text_File_Path}, then prints the number it
 * returns. An {@link IOException} from the build is printed as a stack trace
 * and otherwise ignored.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    final File sourceDir = new File(Text_File_Path);
    final File indexDir = new File(Dest_Index_Path);
    try {
        final int indexedCount = indexBuilder(indexDir, sourceDir);
        System.out.println("Index Finished " + indexedCount + " docs");
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
}
/*========================================================
* 索引创建函数,生成IndexWriter创建索引,调用子目录索引函数,并优化
* 存储本地磁盘索引
*========================================================*/
/**
 * Builds the on-disk Lucene index at {@code indexPath} from the contents of
 * {@code localPath}.
 *
 * <p>The source directory is validated first. An {@link IndexWriter} is then
 * opened with its third constructor argument set to {@code true}, handed to
 * {@code SubindexBuilder} together with {@code localPath}, optimized and
 * closed. The writer is closed in a {@code finally} block so that a failure
 * part-way through indexing does not leave the writer (and whatever lock it
 * holds on the index directory) open.
 *
 * @param indexPath directory the index is written to
 * @param localPath directory to index; must exist, be a directory and be readable
 * @return the value of {@code IndexWriter.docCount()} after {@code SubindexBuilder} returns
 * @throws IOException if {@code localPath} fails validation, or if indexing,
 *         optimizing or closing the writer fails
 */
public static int indexBuilder(File indexPath, File localPath)
        throws IOException {
    if (!localPath.exists() || !localPath.isDirectory() || !localPath.canRead()) {
        throw new IOException(localPath + "不存在或者不允许访问");
    }
    System.out.println("目标路径完好");
    IndexWriter FSWriter = new IndexWriter(indexPath, new StandardAnalyzer(), true);
    try {
        FSWriter.setUseCompoundFile(true);
        SubindexBuilder(FSWriter, localPath);
        // Read the count before optimizing, exactly as the original ordering did.
        int num = FSWriter.docCount();
        FSWriter.optimize();
        return num;
    } finally {
        // Always release the writer, even when indexing throws.
        FSWriter.close();
    }
}
/*========================================================
* 判断当前文件名是否符合文件后缀要求
*========================================================*/
/**
 * Tells whether a file name ends with one of the accepted suffixes:
 * {@code .txt}, {@code .html}, {@code .ini}, {@code .conf}, {@code .pdf}
 * or {@code .doc}.
 *
 * <p>The comparison ignores case, in line with the {@code equalsIgnoreCase}
 * extension checks in {@code fileindexBuilder}, so {@code NOTES.TXT} is
 * accepted just like {@code notes.txt}.
 *
 * @param name file name to test; {@code null} is treated as not valid
 * @return {@code true} if {@code name} ends with an accepted suffix
 */
private static boolean IsValidType(String name) {
    if (name == null) {
        return false;
    }
    final String[] suffixes = {".txt", ".html", ".ini", ".conf", ".pdf", ".doc"};
    for (String suffix : suffixes) {
        int offset = name.length() - suffix.length();
        // regionMatches(true, ...) is a locale-independent, case-insensitive endsWith.
        if (offset >= 0 && name.regionMatches(true, offset, suffix, 0, suffix.length())) {
            return true;
        }
    }
    return false;
}
/*========================================================
* Handles each supported document type by calling the matching handler, which merges the document into the on-disk index
*========================================================*/
/**
 * Dispatches one file to the handler that matches its extension.
 *
 * <p>Hidden, missing or unreadable files are ignored, as are names without a
 * dot and names that start with a dot. The extension is the text after the
 * LAST dot, so a name such as {@code report.v2.pdf} is routed to
 * {@code Handlepdf} instead of falling through to the plain-text handler.
 * Extension comparison ignores case. Names whose extension has no dedicated
 * handler go to {@code Handletxt} when {@code IsValidType} accepts them, and
 * are skipped otherwise.
 *
 * @param fswriter writer passed through to the selected handler
 * @param subfile  file to index
 * @throws IOException declared for callers; thrown only if a handler throws it
 */
private static void fileindexBuilder(IndexWriter fswriter, File subfile)
        throws IOException {
    if (subfile.isHidden() || !subfile.exists() || !subfile.canRead()) {
        return;
    }
    String strname = subfile.getName();
    int dotpos = strname.lastIndexOf('.');
    // Same skip rule as before: no dot at all, or a name that begins with a dot.
    if (dotpos < 0 || strname.charAt(0) == '.') {
        return;
    }
    String ext = strname.substring(dotpos + 1);
    if (ext.equalsIgnoreCase("pdf")) {
        Handlepdf(fswriter, subfile);
    } else if (ext.equalsIgnoreCase("doc")) {
        Handledoc(fswriter, subfile);
    } else if (ext.equalsIgnoreCase("xml")) {
        Handlexml(fswriter, subfile);
    } else if (ext.equalsIgnoreCase("html") || ext.equalsIgnoreCase("htm")) {
        Handlehtml(fswriter, subfile);
    } else if (IsValidType(strname)) {
        Handletxt(fswriter, subfile);
    }
}
/*========================================================
* Creates an in-memory (RAM) index, builds and adds the plain-text document, then merges it into the on-disk index
*========================================================*/
/**
 * Indexes one plain-text file and merges the result into the on-disk index.
 *
 * <p>A temporary {@link RAMDirectory} index is created, a document obtained
 * from {@code FileDocument.Document(subPath)} is extended with stored,
 * untokenized {@code filename}, {@code filepath} and {@code filetype} fields,
 * and the in-memory index is then added to {@code fswriter}.
 *
 * <p>An {@link IOException} is printed as a stack trace and not rethrown, so
 * one bad file does not abort the caller. The success message is printed only
 * when the merge actually completed.
 *
 * @param fswriter writer of the on-disk index that receives the new document
 * @param subPath  text file to index
 */
private static void Handletxt(IndexWriter fswriter, File subPath) {
    try {
        Directory ramdirectory = new RAMDirectory();
        Analyzer textAnalyzer = new StandardAnalyzer();
        IndexWriter ramWriter = new IndexWriter(ramdirectory, textAnalyzer, true);
        try {
            ramWriter.setUseCompoundFile(true);
            // No separate FileInputStream is opened here: the original opened one,
            // never read from it and never closed it, leaking a file handle per file.
            // NOTE(review): FileDocument is expected to read the file itself - confirm.
            Document document = FileDocument.Document(subPath);
            document.add(new Field("filename", subPath.getName(),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.add(new Field("filepath", subPath.getAbsolutePath(),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.add(new Field("filetype", "txt",
                    Field.Store.YES, Field.Index.UN_TOKENIZED));
            ramWriter.addDocument(document);
            ramWriter.optimize();
        } finally {
            // Close the in-memory writer on every path before merging or bailing out.
            ramWriter.close();
        }
        fswriter.addIndexes(new Directory[] {ramdirectory});
        System.out.println("----------创建索引:Txt 文件成功. ----------");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/*========================================================
* 创建RAM内存索引,生成并添新文档,合并到本地磁盘索引当中
*========================================================*/
private static void Handlepdf (IndexWriter fswriter,File subPath)
{
// 处理分析PDF文档,并索引文档内容
try {
Directory ramdirectory = new RAMDirectory();
Analyzer TextAnalyzer = new StandardAnalyzer(); // 生成分析器
IndexWriter RAMWriter = new IndexWriter(ramdirectory ,TextAnalyzer ,true); // 生成索引器
RAMWriter.setUseCompoundFile(true); // 根据指定文件创建输入流
FileInputStream instream = new FileInputStream(subPath);
System.out.println("" + subPath );
int len = (int) subPath.length() + 1;
byte[] buffer = new byte[ len ] ;
instream.read(buffer);
// 由PDF文件生成文档对象,包含contents字段
//Document document = LucenePDFDocument.getDocument( instream ) ;
Document document = new Document();
Field field_name = new Field("filename", subPath.getName(),
Field.Store.YES,Field.Index.UN_TOKENIZED); // 追加名字字段
document.add(field_name);
Field field_path = new Field("filepath", subPath.getAbsolutePath(),
Field.Store.YES,Field.Index.UN_TOKENIZED); // 追加路径字段
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -