⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 myindexer.java

📁 源码/软件简介: 云网论坛1.1RC国际版是采用JSP开发的集论坛、CMS(网站内容管理系统)、博客、聊天室、商城、交友、语音灌水等于一体的门户式社区。拥有CWBBS ( Cloud Web BBS
💻 JAVA
字号:
package com.redmoon.forum.search;

/**
 * Create a Lucene index.
 */
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;

/**
 * Command-line tool that walks the directory tree under
 * {@link #documentBaseDir} and adds one Lucene document per file to an index
 * created at {@link #indexStorageDir}.
 *
 * <p>Each file is read in full and scanned for the labels
 * "COMPANY CONFORMED NAME:", "FORM TYPE:" and "FILED AS OF DATE:".
 *
 * <p>NOTE(review): every statement that would attach a field to the document
 * is commented out in this file, so the documents added to the index are
 * currently empty. Presumably the {@code Field.Text}/{@code Field.Keyword}
 * factory methods are unavailable in the Lucene version on the classpath --
 * confirm, then re-enable them with the matching API.
 */
public class MyIndexer {
    public static final String documentBaseDir = "../my_lucene_app/documents";
    public static final String indexStorageDir = "../index";

    /** Size, in chars, of the buffer used when reading a file into memory. */
    private static final int READ_BUFFER_CHARS = 1048576;

    static final Pattern companyNamePattern = Pattern.compile(
            "\\s+COMPANY CONFORMED NAME:\\s+(.+)");
    static final Pattern formTypePattern = Pattern.compile(
            "\\s+FORM TYPE:\\s+(.+)");
    static final Pattern filedDatePattern = Pattern.compile(
            "FILED AS OF DATE:\\s+(.+)");

    /**
     * main method gets invoked when run from the command line. Creates a new
     * index, indexes everything under {@link #documentBaseDir}, optimizes the
     * index and closes the writer.
     *
     * @param args array of command line arguments (not used).
     * @throws Exception if the index cannot be created or a file cannot be
     * read.
     */
    public static void main(String[] args) throws Exception {
        MyIndexer indexer = new MyIndexer();
        Analyzer analyzer = new StandardAnalyzer();
        boolean createNewIndex = true;
        IndexWriter indexWriter = new IndexWriter(indexStorageDir, analyzer,
                                                  createNewIndex);
        try {
            indexer.indexDirectory(indexWriter, new File(documentBaseDir));
            indexWriter.optimize();
        } finally {
            // Close the writer even when indexing fails, so it is not left
            // open after an error.
            indexWriter.close();
        }
    }

    /**
     * Recursively descend directories. When this method encounters a
     * file, invoke indexFile method.
     *
     * @param indexWriter The Lucene IndexWriter instance tied to the
     * index we are creating.
     * @param dir the directory we wish to index.
     * @exception IOException if {@code dir} cannot be listed (it does not
     * exist, is not a directory, or is unreadable), or if there is an I/O
     * error either recursing into the tree or reading a file.
     */
    public void indexDirectory(IndexWriter indexWriter, File dir) throws
            IOException {
        File[] fileArray = dir.listFiles();
        if (fileArray == null) {
            // File.listFiles() signals failure with null rather than an
            // exception; surface it as the documented IOException.
            throw new IOException("Cannot list directory: " + dir.getPath());
        }
        for (int i = 0; i < fileArray.length; i++) {
            File file = fileArray[i];
            if (file.isDirectory()) {
                indexDirectory(indexWriter, file);
            } else {
                indexFile(indexWriter, file);
            }
        }
    }

    /**
     * Add a file to the Lucene index.
     *
     * <p>The file's text is read and the three labelled values are extracted;
     * a value is {@code null} when its label does not occur in the file. The
     * field-adding statements below are disabled (see the class comment), so
     * the extracted values are not stored yet. Guard against {@code null}
     * when re-enabling them.
     *
     * @param indexWriter The Lucene IndexWriter instance tied to the
     * index we are creating.
     * @param file the file we wish to index.
     * @exception IOException if there is an I/O error reading a file.
     */
    public void indexFile(IndexWriter indexWriter, File file) throws
            IOException {
        String filename = relativeName(file);
        Document document = new Document();
        // document.add(Field.UnIndexed("filename", filename));
        String textContent = readFully(file);
        /* parse company name. */
        String companyName = firstGroup(companyNamePattern, textContent);
        // document.add(Field.Text("companyName", companyName));
        /* store format. */
        // document.add(Field.Keyword("format", "text"));
        /* parse form type. */
        String formType = firstGroup(formTypePattern, textContent);
        // document.add(Field.Keyword("formType", formType));
        /* parse filed date. */
        String filedDate = firstGroup(filedDatePattern, textContent);
        // document.add(Field.Keyword("filedDate", filedDate));
        /* index full text content, but do not store in index. */
        // document.add(Field.UnStored("content", textContent));
        /* add the document to the index. */
        indexWriter.addDocument(document);
    }

    /**
     * Returns the path of {@code file} with the leading
     * {@link #documentBaseDir} and separator removed, or the unchanged path
     * when the file does not lie under that base directory. A literal prefix
     * comparison is used so that characters such as '.' in the base path are
     * not interpreted as regular-expression syntax.
     */
    private static String relativeName(File file) {
        String path = file.getPath();
        String prefix = new File(documentBaseDir).getPath() + File.separator;
        if (path.startsWith(prefix)) {
            return path.substring(prefix.length());
        }
        return path;
    }

    /**
     * Reads the whole file into a string using the platform default charset
     * (as {@link FileReader} does), always closing the reader.
     */
    private static String readFully(File file) throws IOException {
        FileReader fileReader = new FileReader(file);
        try {
            StringBuffer stringBuffer = new StringBuffer();
            char[] charArray = new char[READ_BUFFER_CHARS];
            int numCharsRead;
            while ((numCharsRead =
                    fileReader.read(charArray, 0, charArray.length)) != -1) {
                stringBuffer.append(charArray, 0, numCharsRead);
            }
            return stringBuffer.toString();
        } finally {
            fileReader.close();
        }
    }

    /**
     * Returns capture group 1 of the first match of {@code pattern} in
     * {@code text}, or {@code null} when the pattern does not match.
     */
    private static String firstGroup(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        if (matcher.find()) {
            return matcher.group(1);
        }
        return null;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -