📄 luceneindexlocaldisk.java
字号:
document.add(field_path);
Field field_type = new Field("filetype","pdf",
Field.Store.YES,Field.Index.UN_TOKENIZED); // 追加类型字段
document.add(field_type);
RAMWriter.addDocument(document); // 添加文档到索引
RAMWriter.optimize();
RAMWriter.close(); // 索引完毕
fswriter.addIndexes(new Directory[]{ramdirectory});
}catch (IOException e) {
e.printStackTrace();
}
System.out.println("----------创建索引:PDF 文件成功. ----------");
}
/**
 * Indexes one Word (.doc) file and merges the result into the main index.
 *
 * <p>The text is read through {@code HWPFDocument}, stored in a one-document
 * in-memory index together with the file name, the absolute path and the fixed
 * type "doc", and that in-memory index is then added to {@code fswriter}.
 * An {@code IOException} is reported with {@code printStackTrace()} and is not
 * rethrown; the success message is printed only when indexing completed.
 *
 * @param fswriter index writer that receives the in-memory index
 * @param subPath  the Word file to index
 */
private static void Handledoc (IndexWriter fswriter,File subPath)
{
    FileInputStream in = null;
    try {
        // Build a throw-away in-memory index holding just this document.
        Directory ramdirectory = new RAMDirectory();
        Analyzer TextAnalyzer = new StandardAnalyzer();
        IndexWriter RAMWriter = new IndexWriter(ramdirectory, TextAnalyzer, true);
        RAMWriter.setUseCompoundFile(true);
        Document document = new Document();

        // Extract the full text of the Word document.
        in = new FileInputStream(subPath);
        HWPFDocument doc = new HWPFDocument(in);
        Range range = doc.getRange();
        String text = range.text();

        // Searchable body text.
        Field field_doc = new Field("contents", text,
                Field.Store.YES, Field.Index.TOKENIZED);
        document.add(field_doc);
        // File name, stored as a single untokenized term.
        Field field_name = new Field("filename", subPath.getName(),
                Field.Store.YES, Field.Index.UN_TOKENIZED);
        document.add(field_name);
        // Absolute path, stored as a single untokenized term.
        Field field_path = new Field("filepath", subPath.getAbsolutePath(),
                Field.Store.YES, Field.Index.UN_TOKENIZED);
        document.add(field_path);
        // Fixed type marker for this handler.
        Field field_type = new Field("filetype", "doc",
                Field.Store.YES, Field.Index.UN_TOKENIZED);
        document.add(field_type);

        System.out.println("----------创建索引:Office 文件内容 ----------");
        RAMWriter.addDocument(document);
        RAMWriter.optimize();
        RAMWriter.close();
        // Merge the finished in-memory index into the caller's index.
        fswriter.addIndexes(new Directory[]{ramdirectory});

        // Reached only when nothing above threw, so the message is truthful.
        System.out.println("----------创建索引:Office 文件成功. ----------");
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Always release the file handle, including on failure.
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Indexes one XML file and merges the result into the main index.
 *
 * <p>The text returned by {@code XMLExtractor.getXMLContent} is stored in a
 * one-document in-memory index together with the file name, the absolute path
 * and the fixed type "xml", and that in-memory index is then added to
 * {@code fswriter}. An {@code IOException} is reported with
 * {@code printStackTrace()} and is not rethrown; the success message is printed
 * only when indexing completed.
 *
 * @param fswriter index writer that receives the in-memory index
 * @param subPath  the XML file to index
 */
private static void Handlexml(IndexWriter fswriter,File subPath)
{
    try {
        // Build a throw-away in-memory index holding just this document.
        Directory ramdirectory = new RAMDirectory();
        Analyzer TextAnalyzer = new StandardAnalyzer();
        IndexWriter RAMWriter = new IndexWriter(ramdirectory, TextAnalyzer, true);
        RAMWriter.setUseCompoundFile(true);
        Document document = new Document();

        XMLExtractor extractor = new XMLExtractor();
        // Pass the absolute path rather than the bare file name: a bare name only
        // resolves when the working directory happens to be the file's directory.
        // NOTE(review): assumes getXMLContent takes a file path - confirm against XMLExtractor.
        String text = extractor.getXMLContent( subPath.getAbsolutePath() );

        // Searchable body text.
        Field field_doc = new Field("contents", text,
                Field.Store.YES, Field.Index.TOKENIZED);
        document.add(field_doc);
        // File name, stored as a single untokenized term.
        Field field_name = new Field("filename", subPath.getName(),
                Field.Store.YES, Field.Index.UN_TOKENIZED);
        document.add(field_name);
        // Absolute path, stored as a single untokenized term.
        Field field_path = new Field("filepath", subPath.getAbsolutePath(),
                Field.Store.YES, Field.Index.UN_TOKENIZED);
        document.add(field_path);
        // Fixed type marker for this handler.
        Field field_type = new Field("filetype", "xml",
                Field.Store.YES, Field.Index.UN_TOKENIZED);
        document.add(field_type);

        System.out.println("----------创建索引:XML 文件内容 ----------");
        RAMWriter.addDocument(document);
        RAMWriter.optimize();
        RAMWriter.close();
        // Merge the finished in-memory index into the caller's index.
        fswriter.addIndexes(new Directory[]{ramdirectory});

        // Reached only when nothing above threw; names the XML handler
        // instead of the "Office" text copied from the Word handler.
        System.out.println("----------创建索引:XML 文件成功. ----------");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Indexes one HTML file and merges the result into the main index.
 *
 * <p>The file is read completely into a string, parsed with {@code Parser}
 * restricted to the {@code body} tag, and the resulting text is stored in a
 * one-document in-memory index together with the file name, the absolute path
 * and the fixed type "html". That in-memory index is then added to
 * {@code fswriter}. A {@code ParserException} is reported with
 * {@code printStackTrace()} and is not rethrown.
 *
 * @param fswriter index writer that receives the in-memory index
 * @param subPath  the HTML file to index
 * @throws IOException if reading the file or writing either index fails
 */
private static void Handlehtml(IndexWriter fswriter,File subPath) throws IOException
{
    System.out.println("handlehtml....");
    try {
        // Build a throw-away in-memory index holding just this document.
        Directory ramdirectory = new RAMDirectory();
        Analyzer TextAnalyzer = new StandardAnalyzer();
        IndexWriter RAMWriter = new IndexWriter(ramdirectory, TextAnalyzer, true);
        RAMWriter.setUseCompoundFile(true);
        Document document = new Document();

        Parser parser = new Parser();
        parser.setEncoding("GB2312");

        // Read until end of file: a single read() call is not guaranteed to fill
        // the buffer, and a buffer sized by byte length would leave trailing NUL
        // characters in the string whenever the file holds multi-byte characters.
        StringBuilder html = new StringBuilder();
        FileReader reader = new FileReader(subPath);
        try {
            char[] buffer = new char[8192];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                html.append(buffer, 0, count);
            }
        } finally {
            // Release the file handle whether or not reading succeeded.
            reader.close();
        }
        String inputHTML = html.toString();
        parser.setInputHTML(inputHTML);

        TextExtractingVisitor visitor = new TextExtractingVisitor ();
        // Keep only what lies inside the <body> element.
        NodeList nodes = parser.parse(new TagNameFilter("body"));
        String text = nodes.asString().trim();
        if (text.length() <= 0) {
            text = " ";
        }
        // NOTE(review): these literals are reproduced exactly as found. If the first
        // one is a plain space it removes every space (and the placeholder above);
        // they may originally have been HTML entities - confirm against the real file.
        text = text.replace(" ", "");
        text = text.replace(">", "");
        text = text.replace("<", "");

        // Searchable body text.
        Field field_doc = new Field("contents", text,
                Field.Store.YES, Field.Index.TOKENIZED);
        document.add(field_doc);
        // NOTE(review): the visitor is never handed to the parser in this method,
        // so this prints whatever a freshly created visitor reports.
        System.out.println(visitor.getExtractedText());
        // File name, stored as a single untokenized term.
        Field field_name = new Field("filename", subPath.getName(),
                Field.Store.YES, Field.Index.UN_TOKENIZED);
        document.add(field_name);
        // Absolute path, stored as a single untokenized term.
        Field field_path = new Field("filepath", subPath.getAbsolutePath(),
                Field.Store.YES, Field.Index.UN_TOKENIZED);
        document.add(field_path);
        // Fixed type marker for this handler.
        Field field_type = new Field("filetype", "html",
                Field.Store.YES, Field.Index.UN_TOKENIZED);
        document.add(field_type);

        System.out.println("----------创建索引:HTML 文件内容 ----------");
        RAMWriter.addDocument(document);
        RAMWriter.optimize();
        RAMWriter.close();
        // Merge the finished in-memory index into the caller's index.
        fswriter.addIndexes(new Directory[]{ramdirectory});
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
/**
 * Recursively walks a directory: descends into every sub-directory and hands
 * each file whose name passes {@code IsValidType} to {@code fileindexBuilder}.
 *
 * <p>A path whose entries cannot be listed is reported on standard error and
 * skipped instead of failing with a {@code NullPointerException}.
 *
 * @param fswriter index writer passed through to the per-file indexer
 * @param subPath  directory to scan
 * @throws IOException declared for the recursive call and the per-file indexer
 */
private static void SubindexBuilder(IndexWriter fswriter,File subPath)
throws IOException{
    File[] filelist = subPath.listFiles();
    if (filelist == null) {
        // listFiles() returns null when the path is not a directory or an I/O error occurs.
        System.err.println(subPath.getAbsolutePath() + " :无法列出目录内容");
        return;
    }
    System.out.println(subPath.getAbsolutePath() + " :子目录个数 " + filelist.length);
    for (int i = 0; i < filelist.length; i++) {
        File file = filelist[i];
        if (file.isDirectory()) {
            // Sub-directory: keep descending.
            SubindexBuilder(fswriter, file);
        } else if (IsValidType(file.getName())) {
            // Supported file type: index it.
            fileindexBuilder(fswriter, file);
        }
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -