📄 indexfiles.java
字号:
package com.briup;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.htmlparser.beans.StringBean;
/**
 * Builds, updates, searches and prunes a Lucene index of web pages.
 *
 * Every indexed page becomes one document with these stored fields:
 * "URL" (tokenized), "URLS" (the same URL, un-tokenized, used as the exact
 * key for deletion), "title" (the first 15 characters of the page text),
 * "contents" (the page text, tokenized) and "crateDate" (indexing date,
 * yyyy-MM-dd). The field names are part of the on-disk index format and must
 * not be renamed.
 */
public class IndexFiles {
    /**
     * Formatter used for the "crateDate" field. SimpleDateFormat is not
     * thread-safe, so callers sharing this instance across threads must
     * synchronize externally.
     */
    public static SimpleDateFormat simpleDateFormat = new SimpleDateFormat(
            "yyyy-MM-dd");

    /** Maximum number of leading content characters stored as the title. */
    private static final int TITLE_LENGTH = 15;

    /**
     * Indexes every URL handed out by URLAction.getIndexURL() into the index
     * at filePath, then optimizes the index.
     *
     * URLs are pulled until an empty string (or null) is returned. A failure
     * while fetching or indexing a single page is printed and the loop
     * continues with the next URL.
     *
     * @param filePath directory of the Lucene index
     * @return number of URLs that were attempted, including ones that failed
     */
    public static int IndexFilesContent(String filePath) {
        StringBean sb = newStringBean();
        URLAction urlAction = new URLAction();
        int count = 0;
        IndexWriter writer = null;
        try {
            writer = openWriter(filePath);
            while (true) {
                String url = urlAction.getIndexURL();
                // An empty (or missing) URL marks the end of the work queue.
                if (url == null || url.equals("")) {
                    break;
                }
                count = count + 1;
                try {
                    writer.addDocument(buildDocument(sb, url));
                } catch (Exception e) {
                    // Best effort: one bad page must not abort the whole run.
                    e.printStackTrace();
                }
            }
            writer.optimize();
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            // Always close, otherwise the index write lock stays held.
            closeWriter(writer);
        }
        return count;
    }

    /**
     * Re-indexes a single URL: removes any document whose "URLS" field equals
     * the URL, fetches the page again and adds a fresh document.
     *
     * Does nothing when url is null or empty.
     *
     * @param filePath directory of the Lucene index
     * @param url      URL of the page to refresh
     */
    public static void updateURL(String filePath, String url) {
        // Check before touching the index: previously the writer was opened
        // first and then leaked (with its lock) on the early return.
        if (url == null || url.equals("")) {
            return;
        }
        deleteFileContent(filePath, url);
        StringBean sb = newStringBean();
        IndexWriter writer = null;
        try {
            writer = openWriter(filePath);
            try {
                writer.addDocument(buildDocument(sb, url));
            } catch (Exception e) {
                e.printStackTrace();
            }
            writer.optimize();
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /**
     * Searches the "contents" field of the index.
     *
     * @param filepath    directory of the Lucene index
     * @param queryString query in Lucene QueryParser syntax
     * @return the hits, or null if the index could not be opened or the
     *         query could not be parsed or executed
     */
    public static Hits searchIndexContent(String filepath, String queryString) {
        return search(filepath, "contents", queryString);
    }

    /**
     * Searches the tokenized "URL" field of the index.
     *
     * @param filepath    directory of the Lucene index
     * @param queryString query in Lucene QueryParser syntax
     * @return the hits, or null if the index could not be opened or the
     *         query could not be parsed or executed
     */
    public static Hits searchIndexURL(String filepath, String queryString) {
        return search(filepath, "URL", queryString);
    }

    /**
     * Deletes every document whose un-tokenized "URLS" field equals url and
     * prints how many documents were removed.
     *
     * @param dir directory of the Lucene index
     * @param url exact URL of the documents to delete
     */
    public static void deleteFileContent(String dir, String url) {
        Term aTerm = new Term("URLS", url);
        File indexDir = new File(dir);
        IndexReader reader = null;
        try {
            reader = IndexReader.open(indexDir);
            int deleted = reader.deleteDocuments(aTerm);
            System.out.println("deleted " + deleted + " documents containing "
                    + aTerm);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Demo driver: builds the index, searches it for "briup", prints
     * highlighted fragments plus the URL of every hit, then refreshes one URL.
     *
     * NOTE(review): the index is built in "D:/index" and searched in
     * "d:/index", but the final updateURL call targets "C:\\index" - confirm
     * whether a different index is really intended there.
     */
    public static void main(String args[]) {
        System.out.println("开始为网站建立索引:");
        int count = IndexFilesContent("D:/index");
        System.out.println("索引建立完成:共有" + count + "条url");
        QueryParser parser = new QueryParser("summary", new StandardAnalyzer());
        try {
            Query query = parser.parse("briup");
            Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(
                    "<font color=red><B>", "</B></font>"), new QueryScorer(
                    query));
            highlighter.setTextFragmenter(new SimpleFragmenter(60));
            Hits hits = searchIndexContent("d:/index", "briup");
            if (hits == null) {
                // The search helper already printed the cause; stop here
                // instead of failing with a NullPointerException below.
                System.err.println("Search failed; nothing to display.");
                return;
            }
            // One analyzer instance is enough for all hits.
            ChineseAnalyzer abc = new ChineseAnalyzer();
            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                String text = doc.get("contents");
                TokenStream tokenStream = abc.tokenStream("contents",
                        new StringReader(text));
                String result = text;
                try {
                    result = highlighter.getBestFragments(tokenStream, text, 3,
                            ".....");
                } catch (StringIndexOutOfBoundsException ignored) {
                    // Deliberate fallback: if the highlighter trips over
                    // token offsets, print the full, un-highlighted text.
                }
                System.out.println(result);
                System.out.println(doc.get("URL"));
                System.out.println("ysesss2");
            }
            updateURL("C:\\index", "http://www.qwserv.com/bbs");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates the StringBean used to extract the text of a web page:
     * links are not included in the output, non-breaking spaces are replaced
     * by regular spaces, and runs of whitespace are collapsed.
     */
    private static StringBean newStringBean() {
        StringBean sb = new StringBean();
        sb.setLinks(false);
        sb.setReplaceNonBreakingSpaces(true);
        sb.setCollapse(true);
        return sb;
    }

    /**
     * Opens a writer on the index directory, appending when the directory
     * already holds more than one file and creating a new index otherwise.
     * A missing or unreadable directory counts as "no index yet".
     *
     * NOTE(review): "more than one file" is only a heuristic for "an index
     * exists" - confirm it matches the files Lucene actually writes here.
     */
    private static IndexWriter openWriter(String filePath) throws IOException {
        String files[] = new File(filePath).list();
        // File.list() returns null when the path is not a readable directory.
        boolean indexExists = files != null && files.length > 1;
        return new IndexWriter(filePath, new StandardAnalyzer(), !indexExists);
    }

    /** Closes the writer if it was opened, printing (not throwing) any error. */
    private static void closeWriter(IndexWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Fetches the text behind url through sb and wraps it in an index document. */
    private static Document buildDocument(StringBean sb, String url) {
        sb.setURL(url);
        String content = sb.getStrings();
        String title = content;
        if (content.length() > TITLE_LENGTH) {
            title = content.substring(0, TITLE_LENGTH);
        }
        Document doc = new Document();
        doc.add(new Field("URL", url, Field.Store.YES,
                Field.Index.TOKENIZED));
        doc.add(new Field("URLS", url, Field.Store.YES,
                Field.Index.UN_TOKENIZED));
        doc.add(new Field("title", title, Field.Store.YES,
                Field.Index.UN_TOKENIZED));
        doc.add(new Field("contents", content, Field.Store.YES,
                Field.Index.TOKENIZED));
        doc.add(new Field("crateDate", simpleDateFormat.format(new Date()),
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        return doc;
    }

    /**
     * Parses queryString against the given default field and runs it.
     *
     * On success the underlying reader is intentionally left open, because
     * the returned Hits is still used by callers to load documents. On
     * failure the reader is closed here so it does not leak.
     */
    private static Hits search(String filepath, String field,
            String queryString) {
        Hits hits = null;
        IndexReader reader = null;
        try {
            reader = IndexReader.open(filepath);
            Searcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser(field,
                    new StandardAnalyzer());
            Query query = parser.parse(queryString);
            System.out.println("Searching for: " + query.toString(field));
            hits = searcher.search(query);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (hits == null && reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return hits;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -