📄 myindexer.java
字号:
package com.redmoon.forum.search;
/**
* Create a Lucene index.
*/
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
/**
 * Command-line tool that walks a directory tree and adds one Lucene
 * {@link Document} per file to the index stored in {@link #indexStorageDir}.
 *
 * <p>NOTE(review): every {@code document.add(...)} call in
 * {@link #indexFile(IndexWriter, File)} is disabled, so the documents written
 * to the index currently carry no fields. Presumably the {@code Field.Text} /
 * {@code Field.Keyword} style helpers are not available in the Lucene version
 * this project builds against -- confirm and re-enable them with the matching
 * {@code Field} API.
 */
public class MyIndexer {
    /** Root directory that {@link #main(String[])} scans for documents. */
    public static final String documentBaseDir = "../my_lucene_app/documents";
    /** Directory in which {@link #main(String[])} creates the index. */
    public static final String indexStorageDir = "../index";
    /** Captures the remainder of the line after "COMPANY CONFORMED NAME:". */
    static Pattern companyNamePattern = Pattern.compile(
            "\\s+COMPANY CONFORMED NAME:\\s+(.+)");
    /** Captures the remainder of the line after "FORM TYPE:". */
    static Pattern formTypePattern = Pattern.compile("\\s+FORM TYPE:\\s+(.+)");
    /** Captures the remainder of the line after "FILED AS OF DATE:". */
    static Pattern filedDatePattern = Pattern.compile(
            "FILED AS OF DATE:\\s+(.+)");

    /** Number of chars requested per read while loading a file. */
    private static final int READ_BUFFER_SIZE = 1048576;

    /**
     * main method gets invoked when run from the command line. Opens an
     * index writer on {@link #indexStorageDir}, indexes everything below
     * {@link #documentBaseDir}, then optimizes and closes the writer.
     *
     * @param args array of command line arguments (not used).
     * @exception Exception if the index cannot be opened or written, or a
     * document cannot be read.
     */
    public static void main(String[] args) throws Exception {
        MyIndexer indexer = new MyIndexer();
        Analyzer analyzer = new StandardAnalyzer();
        boolean createNewIndex = true;
        IndexWriter indexWriter = new IndexWriter(indexStorageDir, analyzer,
                createNewIndex);
        try {
            indexer.indexDirectory(indexWriter, new File(documentBaseDir));
            indexWriter.optimize();
        } finally {
            // Close the writer even when indexing fails part-way through.
            indexWriter.close();
        }
    }

    /**
     * Recursively descend directories. When this method encounters a
     * file, invoke indexFile method.
     *
     * @param indexWriter The Lucene IndexWriter instance tied to the
     * index we are creating.
     * @param dir the directory we wish to index.
     * @exception IOException if there is an I/O error either recursing
     * into the tree or reading a file, including when {@code dir} cannot
     * be listed.
     */
    public void indexDirectory(IndexWriter indexWriter, File dir) throws
            IOException {
        File[] fileArray = dir.listFiles();
        if (fileArray == null) {
            // listFiles() yields null when dir is not a directory or an
            // I/O error occurs; report that instead of failing with a
            // NullPointerException in the loop below.
            throw new IOException("Cannot list directory: " + dir.getPath());
        }
        for (int i = 0; i < fileArray.length; i++) {
            File file = fileArray[i];
            if (file.isDirectory()) {
                indexDirectory(indexWriter, file);
            } else {
                indexFile(indexWriter, file);
            }
        }
    }

    /**
     * Add a file to the Lucene index. The file is read completely and its
     * company name, form type and filed date headers are parsed; a header
     * that is not present yields {@code null} rather than an exception.
     *
     * @param indexWriter The Lucene IndexWriter instance tied to the
     * index we are creating.
     * @param file the file we wish to index.
     * @exception IOException if there is an I/O error reading a file.
     */
    public void indexFile(IndexWriter indexWriter, File file) throws
            IOException {
        String filename = relativePath(file);
        Document document = new Document();
        // NOTE(review): the field additions below are disabled, so the
        // parsed values are currently unused. When re-enabling them, skip
        // any field whose value is null (header missing from the file).
        // document.add(Field.UnIndexed("filename", filename));
        String textContent = readFully(file);
        /* parse company name. */
        String companyName = firstGroup(companyNamePattern, textContent);
        // document.add(Field.Text("companyName", companyName));
        /* store format. */
        // document.add(Field.Keyword("format", "text"));
        /* parse form type. */
        String formType = firstGroup(formTypePattern, textContent);
        // document.add(Field.Keyword("formType", formType));
        /* parse filed date. */
        String filedDate = firstGroup(filedDatePattern, textContent);
        // document.add(Field.Keyword("filedDate", filedDate));
        /* index full text content, but do not store in index. */
        // document.add(Field.UnStored("content", textContent));
        /* add the document to the index. */
        indexWriter.addDocument(document);
    }

    /**
     * Returns the path of {@code file} with the {@link #documentBaseDir}
     * prefix removed, or the unchanged path when it does not start with it.
     */
    private static String relativePath(File file) {
        // Compare literal prefixes using the platform separator; building a
        // regex from the directory name would treat its dots as wildcards.
        String basePrefix = new File(documentBaseDir).getPath()
                + File.separator;
        String path = file.getPath();
        if (path.startsWith(basePrefix)) {
            return path.substring(basePrefix.length());
        }
        return path;
    }

    /**
     * Reads the whole file into a String using the platform default
     * charset, closing the reader even when a read fails.
     */
    private static String readFully(File file) throws IOException {
        StringBuffer stringBuffer = new StringBuffer();
        char[] charArray = new char[READ_BUFFER_SIZE];
        FileReader fileReader = new FileReader(file);
        try {
            int numCharsRead;
            while ((numCharsRead = fileReader.read(charArray, 0,
                    READ_BUFFER_SIZE)) > 0) {
                stringBuffer.append(charArray, 0, numCharsRead);
            }
        } finally {
            fileReader.close();
        }
        return stringBuffer.toString();
    }

    /**
     * Returns capture group 1 of the first match of {@code pattern} in
     * {@code text}, or {@code null} when the pattern does not occur.
     */
    private static String firstGroup(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        // group(1) is only legal after a successful find().
        return matcher.find() ? matcher.group(1) : null;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -