📄 indexfiles.java

📁 本系统是一个跨平台的搜索引擎
💻 JAVA
字号:
package com.briup;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.htmlparser.beans.StringBean;

/**
 * Builds, updates and searches a Lucene index of web pages.
 *
 * <p>
 * Each indexed page becomes one document with these stored fields:
 * <ul>
 * <li>{@code URL} - the page URL, tokenized (searchable by words)</li>
 * <li>{@code URLS} - the page URL, untokenized (exact-match key used for
 * deletion)</li>
 * <li>{@code title} - the first {@value #TITLE_LENGTH} characters of the
 * extracted text</li>
 * <li>{@code contents} - the full extracted text, tokenized</li>
 * <li>{@code crateDate} - the indexing date formatted with
 * {@link #simpleDateFormat} (the field name is misspelled on purpose of
 * compatibility: it is the key already stored in existing indexes)</li>
 * </ul>
 *
 * <p>
 * All methods report failures by printing a stack trace; none of them
 * propagates a checked exception to the caller.
 */
public class IndexFiles {
	/**
	 * Formatter used for the {@code crateDate} field.
	 * {@link SimpleDateFormat} is not thread-safe, so concurrent indexing
	 * calls share this instance at their own risk.
	 */
	public static SimpleDateFormat simpleDateFormat = new SimpleDateFormat(
			"yyyy-MM-dd");

	/** Maximum number of characters of page text stored as the title. */
	private static final int TITLE_LENGTH = 15;

	/**
	 * Indexes every URL handed out by {@code URLAction.getIndexURL()} until
	 * it returns an empty string (or {@code null}).
	 *
	 * <p>
	 * A page that cannot be fetched or indexed is reported and skipped; it
	 * is still included in the returned count.
	 *
	 * @param filePath
	 *            directory of the Lucene index; the index is created when
	 *            none exists there yet, otherwise documents are appended
	 * @return the number of URLs taken from the URL source
	 */
	public static int IndexFilesContent(String filePath) {
		StringBean sb = newPageReader();
		int count = 0;
		URLAction urlAction = new URLAction();
		IndexWriter writer = null;
		try {
			writer = openWriter(filePath);
			while (true) {
				String url = urlAction.getIndexURL();
				if (url == null || url.equals("")) {
					break;
				}
				count = count + 1;
				addPage(writer, sb, url);
			}
			writer.optimize();
		} catch (IOException ex) {
			ex.printStackTrace();
		} finally {
			// Always release the writer, otherwise its lock stays behind and
			// blocks every later attempt to modify the index.
			closeWriter(writer);
		}
		return count;
	}

	/**
	 * Re-indexes a single URL: removes the documents currently stored for it
	 * and adds a freshly fetched one.
	 *
	 * <p>
	 * A {@code null} or empty URL is ignored without touching the index.
	 *
	 * @param filePath
	 *            directory of the Lucene index
	 * @param url
	 *            the page to refresh
	 */
	public static void updateURL(String filePath, String url) {
		// Check before any index access so that no writer is opened (and
		// left open) for a request that has nothing to do.
		if (url == null || url.equals("")) {
			return;
		}
		deleteFileContent(filePath, url);
		StringBean sb = newPageReader();
		IndexWriter writer = null;
		try {
			writer = openWriter(filePath);
			addPage(writer, sb, url);
			writer.optimize();
		} catch (IOException ex) {
			ex.printStackTrace();
		} finally {
			closeWriter(writer);
		}
	}

	/**
	 * Searches the {@code contents} field.
	 *
	 * @param filepath
	 *            directory of the Lucene index
	 * @param queryString
	 *            query in Lucene query-parser syntax
	 * @return the matching documents, or {@code null} when the search failed
	 */
	public static Hits searchIndexContent(String filepath, String queryString) {
		return search(filepath, "contents", queryString);
	}

	/**
	 * Searches the tokenized {@code URL} field.
	 *
	 * @param filepath
	 *            directory of the Lucene index
	 * @param queryString
	 *            query in Lucene query-parser syntax
	 * @return the matching documents, or {@code null} when the search failed
	 */
	public static Hits searchIndexURL(String filepath, String queryString) {
		return search(filepath, "URL", queryString);
	}

	/**
	 * Deletes every document whose untokenized {@code URLS} field equals the
	 * given URL and prints how many were removed.
	 *
	 * @param dir
	 *            directory of the Lucene index
	 * @param url
	 *            exact URL of the documents to delete
	 */
	public static void deleteFileContent(String dir, String url) {
		Term aTerm = new Term("URLS", url);
		File indexDir = new File(dir);
		IndexReader reader = null;
		try {
			reader = IndexReader.open(indexDir);
			int deleted = reader.deleteDocuments(aTerm);
			System.out.println("deleted " + deleted + " documents containing "
					+ aTerm);
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeReader(reader);
		}
	}

	/**
	 * Demo entry point: builds the index, runs a sample search with
	 * highlighted fragments, then refreshes one URL.
	 */
	public static void main(String args[]) {
		System.out.println("开始为网站建立索引：");
		int count = IndexFilesContent("D:/index");
		System.out.println("索引建立完成：共有" + count + "条url");

		QueryParser parser = new QueryParser("summary", new StandardAnalyzer());
		try {
			Query query = parser.parse("briup");
			Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(
					"<font color=red><B>", "</B></font>"), new QueryScorer(
					query));
			highlighter.setTextFragmenter(new SimpleFragmenter(60));
			Hits hits = searchIndexContent("d:/index", "briup");
			// One analyzer is enough for all hits.
			ChineseAnalyzer analyzer = new ChineseAnalyzer();
			// hits is null when the search itself failed (already reported).
			int total = (hits == null) ? 0 : hits.length();
			for (int i = 0; i < total; i++) {
				Document doc = hits.doc(i);
				String text = doc.get("contents");
				String result = text;
				if (text != null) {
					TokenStream tokenStream = analyzer.tokenStream("contents",
							new StringReader(text));
					try {
						result = highlighter.getBestFragments(tokenStream,
								text, 3, ".....");
					} catch (StringIndexOutOfBoundsException ignored) {
						// Highlighting failed on this text; fall back to
						// printing the unhighlighted content.
					}
				}
				System.out.println(result);
				System.out.println(doc.get("URL"));
				System.out.println("ysesss2");
			}
			// NOTE(review): this targets C:\index while the index above is
			// built and searched under D:/index - confirm this is intended.
			updateURL("C:\\index", "http://www.qwserv.com/bbs");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Creates the HTMLParser bean used to extract the text of a page.
	 */
	private static StringBean newPageReader() {
		StringBean sb = new StringBean();
		// Do not include the page's links in the extracted text.
		sb.setLinks(false);
		// Replace non-breaking spaces with regular spaces.
		sb.setReplaceNonBreakingSpaces(true);
		// Collapse runs of whitespace into a single character; set to false
		// only when the original layout (e.g. code indentation) matters.
		sb.setCollapse(true);
		return sb;
	}

	/**
	 * Opens a writer on the index directory, creating a new index only when
	 * the directory does not already contain one.
	 */
	private static IndexWriter openWriter(String filePath) throws IOException {
		boolean create = !IndexReader.indexExists(filePath);
		return new IndexWriter(filePath, new StandardAnalyzer(), create);
	}

	/**
	 * Fetches one page and adds it to the index. Indexing is best-effort:
	 * any failure for this page is reported and swallowed so that the
	 * remaining pages are still processed.
	 */
	private static void addPage(IndexWriter writer, StringBean sb, String url) {
		try {
			sb.setURL(url);
			String content = sb.getStrings();
			if (content == null) {
				// NOTE(review): presumably only happens when the page could
				// not be fetched - confirm against StringBean.
				System.err.println("no text extracted from " + url
						+ ", page skipped");
				return;
			}
			writer.addDocument(buildDocument(url, content));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Builds the index document for a page from its URL and extracted text.
	 */
	private static Document buildDocument(String url, String content) {
		String title = content;
		if (content.length() > TITLE_LENGTH) {
			title = content.substring(0, TITLE_LENGTH);
		}
		Document doc = new Document();
		doc.add(new Field("URL", url, Field.Store.YES, Field.Index.TOKENIZED));
		doc.add(new Field("URLS", url, Field.Store.YES,
				Field.Index.UN_TOKENIZED));
		doc.add(new Field("title", title, Field.Store.YES,
				Field.Index.UN_TOKENIZED));
		doc.add(new Field("contents", content, Field.Store.YES,
				Field.Index.TOKENIZED));
		doc.add(new Field("crateDate", simpleDateFormat.format(new Date()),
				Field.Store.YES, Field.Index.UN_TOKENIZED));
		return doc;
	}

	/**
	 * Runs a query-parser query against one field of the index.
	 *
	 * <p>
	 * On success the underlying reader is intentionally left open: the
	 * returned {@link Hits} loads documents lazily and needs it. On failure
	 * the reader is closed here because nothing else could ever close it.
	 */
	private static Hits search(String filepath, String field,
			String queryString) {
		IndexReader reader = null;
		try {
			reader = IndexReader.open(filepath);
			Searcher searcher = new IndexSearcher(reader);
			QueryParser parser = new QueryParser(field, new StandardAnalyzer());
			Query query = parser.parse(queryString);
			System.out.println("Searching for: " + query.toString(field));
			return searcher.search(query);
		} catch (Exception e) {
			e.printStackTrace();
			closeReader(reader);
			return null;
		}
	}

	/** Closes the writer if it was opened, reporting any failure. */
	private static void closeWriter(IndexWriter writer) {
		if (writer == null) {
			return;
		}
		try {
			writer.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/** Closes the reader if it was opened, reporting any failure. */
	private static void closeReader(IndexReader reader) {
		if (reader == null) {
			return;
		}
		try {
			reader.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}
💿 文件大小 2178 K
👤 上传用户 laosoler
📂 所属分类 Java编程
🏷️ 相关标签

#搜索引擎 #跨平台
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -