luceneindexlocaldisk.java

来自「Lucene+Nutch一书的全部源码测试源码和几个简单的项目」· Java 代码 · 共 386 行 · 第 1/2 页
JAVA
386 行
package Chapter12;

import java.io.IOException;
import java.io.File;
import java.io.FileReader;

import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import java.io.*;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;

import org.apache.poi.hwpf.extractor.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.*;
import org.apache.poi.hwpf.usermodel.*;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.TextExtractingVisitor;

/*******************************************************************
 * 本代码完成本地指定目录的遍历和文件查找。对指定后缀的文件进行分析，利用Lucene建立
 * 索引，为后续检索使用做好准备。
 *******************************************************************/
public class LuceneIndexLocalDisk {

	private static String Dest_Index_Path = "D:\\workshop\\alldata3";
	//private static String Text_File_Path  = "D:\\workshop\\ch12\\012\\";
	private static String Text_File_Path  = "D:\\科技部项目\\参考文献资料\\";
	//private static String Text_File_Path  = "C:\\test\\";
	
	/*========================================================
	 * 主函数，指定索引目录和待分析的目录，生成Lucene索引
	 *========================================================*/
	/**
	 * Program entry point: builds the on-disk index and prints how many
	 * documents it contains.
	 *
	 * With no arguments the class-level defaults Dest_Index_Path and
	 * Text_File_Path are used, exactly as before. Optionally:
	 *   args[0] - directory in which the index is written
	 *   args[1] - directory whose files are indexed
	 *
	 * An IOException raised by indexBuilder is reported with
	 * printStackTrace and does not propagate.
	 *
	 * @param args optional overrides for the index and source directories
	 */
	public static void main(String[] args) {

		// Fall back to the built-in defaults for any argument that is not supplied.
		String indexDir  = (args != null && args.length > 0) ? args[0] : Dest_Index_Path;
		String sourceDir = (args != null && args.length > 1) ? args[1] : Text_File_Path;

		File indexpath = new File(indexDir);
		File localPath = new File(sourceDir);

		try {
			int nums = indexBuilder(indexpath, localPath);
			System.out.println("Index Finished " + nums + "  docs");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	/*========================================================
	 * 索引创建函数，生成IndexWriter创建索引，调用子目录索引函数，并优化
	 * 存储本地磁盘索引
	 *========================================================*/
	/**
	 * Creates the on-disk index for everything under localPath.
	 *
	 * Opens an IndexWriter on indexPath with a StandardAnalyzer, hands it to
	 * SubindexBuilder together with localPath, reads the document count,
	 * optimizes the index and closes the writer.
	 *
	 * NOTE(review): the writer is constructed with the flag {@code true};
	 * in the Lucene 2.x API this presumably means "create / overwrite an
	 * existing index" - confirm against the Lucene version in use.
	 * SubindexBuilder is defined outside this view; presumably it walks the
	 * directory tree and feeds files to the writer.
	 *
	 * @param indexPath directory in which the index is stored
	 * @param localPath directory to index; must exist, be a directory and be readable
	 * @return the writer's document count, read before optimize() is called
	 * @throws IOException if localPath fails the checks above, or if any
	 *         index operation fails
	 */
	public static int indexBuilder( File indexPath , File localPath ) 
	throws IOException{
		if(!localPath.exists() || !localPath.isDirectory() || !localPath.canRead()){
			throw new IOException(localPath + "不存在或者不允许访问" );
		}
		System.out.println("目标路径完好");		

		IndexWriter FSWriter = new IndexWriter(indexPath,new StandardAnalyzer(),true);
		try {
			FSWriter.setUseCompoundFile(true);

			SubindexBuilder(FSWriter,localPath);
			int num =  FSWriter.docCount();
			FSWriter.optimize();
			return num;
		} finally {
			// Always close the writer, even when indexing fails part-way,
			// so that it is not left open after an exception.
			FSWriter.close();
		}
	}
	
	/*========================================================
	 * 判断当前文件名是否符合文件后缀要求
	 *========================================================*/
	/**
	 * Tells whether a file name ends with one of the suffixes this indexer
	 * accepts: .txt, .html, .ini, .conf, .pdf or .doc.
	 *
	 * The comparison ignores case, so that it agrees with fileindexBuilder,
	 * which compares extensions with equalsIgnoreCase; previously a name
	 * such as "README.TXT" was rejected here.
	 *
	 * @param name file name to test; null is treated as "not accepted"
	 * @return true if the name ends with an accepted suffix, false otherwise
	 */
	private static boolean IsValidType(String name){
		if (name == null) {
			return false;
		}
		String[] suffixes = { ".txt", ".html", ".ini", ".conf", ".pdf", ".doc" };
		for (int i = 0; i < suffixes.length; i++) {
			String suffix = suffixes[i];
			int start = name.length() - suffix.length();
			// regionMatches(true, ...) is a case-insensitive endsWith.
			if (start >= 0 && name.regionMatches(true, start, suffix, 0, suffix.length())) {
				return true;
			}
		}
		return false;
	}
	/*========================================================
	 * 处理各种不同类型文档,调用相应的参数，合并到本地磁盘索引当中
	 *========================================================*/
	/**
	 * Dispatches one file to the handler that matches its extension.
	 *
	 * Hidden, missing or unreadable files are ignored, as are names without
	 * an extension and names that begin with a dot. The extension is the
	 * text after the LAST dot, compared case-insensitively:
	 *   pdf        - Handlepdf
	 *   doc        - Handledoc
	 *   xml        - Handlexml
	 *   html, htm  - Handlehtml
	 *   otherwise  - Handletxt, if IsValidType accepts the name
	 *
	 * The extension used to be taken from the first dot, so a name such as
	 * "paper.final.pdf" missed the PDF branch and was indexed as plain text.
	 *
	 * @param fswriter writer of the on-disk index, passed through to the handler
	 * @param subfile  file to index
	 * @throws IOException declared for callers; the handlers visible in this
	 *         file catch their own IOExceptions
	 */
	private static void  fileindexBuilder(IndexWriter fswriter,File subfile)  
	throws IOException{
	
		if( subfile.isHidden() || !subfile.exists() || !subfile.canRead()){
			return ;
		}
		String strname = subfile.getName();
		int dotpos = strname.lastIndexOf('.');
		// No dot at all, or a name starting with a dot: nothing to index
		// (names starting with a dot were skipped by the original check too).
		if( dotpos <= 0 || strname.charAt(0) == '.' ){
			return ;
		}

		String ext = strname.substring(dotpos + 1);
		if( ext.equalsIgnoreCase("pdf") ){
			Handlepdf(fswriter ,subfile);
		} else if( ext.equalsIgnoreCase("doc") ){
			Handledoc(fswriter ,subfile);
		} else if( ext.equalsIgnoreCase("xml") ){
			Handlexml(fswriter ,subfile);
		} else if( ext.equalsIgnoreCase("html") || ext.equalsIgnoreCase("htm") ){
			Handlehtml(fswriter ,subfile);
		} else if( IsValidType(strname) ){
			Handletxt(fswriter ,subfile);
		}
	}
	/*========================================================
	 * 创建RAM内存索引，生成并添加纯文本文档，合并到本地磁盘索引当中
	 *========================================================*/	
	/**
	 * Indexes one plain-text file: builds a single-document index in a
	 * RAMDirectory and merges it into the on-disk index.
	 *
	 * The document comes from FileDocument.Document(subPath) and gets three
	 * extra stored, untokenized fields: "filename", "filepath" and
	 * "filetype" (always "txt").
	 *
	 * An IOException is reported with printStackTrace and not rethrown; the
	 * success message is printed only when the merge actually completed.
	 *
	 * NOTE(review): FileDocument is defined outside this view; it is assumed
	 * to open and read subPath itself, which is why the previously unused
	 * (and never closed) FileInputStream has been removed - confirm.
	 *
	 * @param fswriter writer of the on-disk index that receives the document
	 * @param subPath  text file to index
	 */
	private static void  Handletxt (IndexWriter fswriter,File subPath)
	{
		try {
			Directory ramdirectory = new RAMDirectory();
			Analyzer TextAnalyzer = new StandardAnalyzer();             // analyzer for the in-memory index
			IndexWriter RAMWriter = new IndexWriter(ramdirectory ,TextAnalyzer ,true); // in-memory index writer

			try {
				RAMWriter.setUseCompoundFile(true);

				// Build the document from the text file.
				Document document = FileDocument.Document(subPath) ; 

				Field field_name = new Field("filename", subPath.getName(),   
						Field.Store.YES,Field.Index.UN_TOKENIZED);         // file name field
				document.add(field_name);

				Field field_path = new Field("filepath", subPath.getAbsolutePath(), 
						Field.Store.YES,Field.Index.UN_TOKENIZED);         // absolute path field
				document.add(field_path);

				Field field_type = new Field("filetype","txt",   
						Field.Store.YES,Field.Index.UN_TOKENIZED);         // file type field
				document.add(field_type);

				RAMWriter.addDocument(document);
				RAMWriter.optimize();
			} finally {
				// Close the in-memory writer even if building the document failed.
				RAMWriter.close();
			}

			// Merge the finished in-memory index into the on-disk index.
			fswriter.addIndexes(new Directory[]{ramdirectory});

			// Reached only when every step above succeeded.
			System.out.println("----------创建索引：Txt 文件成功. ----------");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	/*========================================================
	 * 创建RAM内存索引，生成并添加新文档，合并到本地磁盘索引当中
	 *========================================================*/
	private static void  Handlepdf (IndexWriter fswriter,File subPath)
	{
			// 处理分析PDF文档，并索引文档内容
			try {
				Directory ramdirectory = new RAMDirectory();
				Analyzer TextAnalyzer = new StandardAnalyzer();            // 生成分析器
				IndexWriter RAMWriter = new IndexWriter(ramdirectory ,TextAnalyzer ,true); // 生成索引器
				
				RAMWriter.setUseCompoundFile(true);                        // 根据指定文件创建输入流

				FileInputStream instream = new FileInputStream(subPath); 
			
				System.out.println("" + subPath );
				
				int len = (int) subPath.length() + 1;
				byte[] buffer = new byte[ len ] ;
				instream.read(buffer);
				
				
				// 由PDF文件生成文档对象，包含contents字段
				//Document document = LucenePDFDocument.getDocument( instream ) ;
				Document document = new Document();
				Field field_name = new Field("filename", subPath.getName(),   
						Field.Store.YES,Field.Index.UN_TOKENIZED);         // 追加名字字段
				document.add(field_name);
				
				Field field_path = new Field("filepath", subPath.getAbsolutePath(), 
						Field.Store.YES,Field.Index.UN_TOKENIZED);         // 追加路径字段
luceneindexlocaldisk.java - 源码说明

本页面展示了「Lucene+Nutch一书的全部源码测试源码和几个简单的项目」中的 luceneindexlocaldisk.java 源码文件，采用 Java 编程语言编写，共 386 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Lucene相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?