📄 spellchecker.java
字号:
package org.apache.lucene.search.spell;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import java.io.IOException;import org.apache.lucene.analysis.WhitespaceAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.Term;import org.apache.lucene.search.BooleanClause;import org.apache.lucene.search.BooleanQuery;import org.apache.lucene.search.Hits;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.TermQuery;import org.apache.lucene.store.Directory;import java.util.*;/** * <p> * Spell Checker class (Main class) <br/> * (initially inspired by the David Spencer code). * </p> * * <p>Example Usage: * * <pre> * SpellChecker spellchecker = new SpellChecker(spellIndexDirectory); * // To index a field of a user index: * spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field)); * // To index a file containing words: * spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt"))); * String[] suggestions = spellchecker.suggestSimilar("misspelt", 5); * </pre> * * @author Nicolas Maisonneuve * @version 1.0 */public class SpellChecker { /** * Field name for each word in the ngram index. */ public static final String F_WORD = "word"; /** * the spell index */ Directory spellIndex; /** * Boost value for start and end grams */ private float bStart = 2.0f; private float bEnd = 1.0f; private IndexReader reader; private IndexSearcher searcher; // minimum score for hits generated by the spell checker query private float minScore = 0.5f; public SpellChecker(Directory spellIndex) throws IOException { this.setSpellIndex(spellIndex); } public void setSpellIndex(Directory spellIndex) throws IOException { this.spellIndex = spellIndex; if (!IndexReader.indexExists(spellIndex)) { IndexWriter writer = new IndexWriter(spellIndex, null, true); writer.close(); } searcher = new IndexSearcher(this.spellIndex); } /** * Sets the accuracy 0 < minScore < 1; default 0.5 */ public void setAccuracy(float min) { this.minScore = min; } /** * Suggest similar words * @param word String the word you want a spell check done on * @param numSug int the number of suggest words * @throws IOException * @return String[] */ public String[] suggestSimilar(String word, int numSug) throws IOException { return this.suggestSimilar(word, numSug, null, null, false); } /** * Suggest similar words (restricted or not to a field of a user index) * @param word String the word you want a spell check done on * @param numSug int the number of suggest words * @param ir the indexReader of the user index (can be null see field param) * @param field String the field of the user index: if field is not null, the suggested * words are restricted to the words present in this field. * @param morePopular boolean return only the suggest words that are more frequent than the searched word * (only if restricted mode = (indexReader!=null and field!=null) * @throws IOException * @return String[] the sorted list of the suggest words with this 2 criteria: * first criteria: the edit distance, second criteria (only if restricted mode): the popularity * of the suggest words in the field of the user index */ public String[] suggestSimilar(String word, int numSug, IndexReader ir, String field, boolean morePopular) throws IOException { float min = this.minScore; final TRStringDistance sd = new TRStringDistance(word); final int lengthWord = word.length(); final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(field, word)) : 0; // if the word exists in the real index and we don't care for word frequency, return the word itself if (!morePopular && goalFreq > 0) { return new String[] { word }; } BooleanQuery query = new BooleanQuery(); String[] grams; String key; for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) { key = "gram" + ng; // form key grams = formGrams(word, ng); // form word into ngrams (allow dups too) if (grams.length == 0) { continue; // hmm } if (bStart > 0) { // should we boost prefixes? add(query, "start" + ng, grams[0], bStart); // matches start of word } if (bEnd > 0) { // should we boost suffixes add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word } for (int i = 0; i < grams.length; i++) { add(query, key, grams[i]); } }// System.out.println("Q: " + query); Hits hits = searcher.search(query);// System.out.println("HITS: " + hits.length()); SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); // go thru more than 'maxr' matches in case the distance filter triggers int stop = Math.min(hits.length(), 10 * numSug); SuggestWord sugWord = new SuggestWord(); for (int i = 0; i < stop; i++) { sugWord.string = hits.doc(i).get(F_WORD); // get orig word // don't suggest a word for itself, that would be silly if (sugWord.string.equals(word)) { continue; } // edit distance/normalize with the minScore word length sugWord.score = 1.0f - ((float) sd.getDistance(sugWord.string) / Math .min(sugWord.string.length(), lengthWord)); if (sugWord.score < min) { continue; } if (ir != null) { // use the user index sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index // don't suggest a word that is not present in the field if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) { continue; } } sugQueue.insert(sugWord); if (sugQueue.size() == numSug) { // if queue full, maintain the minScore score min = ((SuggestWord) sugQueue.top()).score; } sugWord = new SuggestWord(); } // convert to array string String[] list = new String[sugQueue.size()]; for (int i = sugQueue.size() - 1; i >= 0; i--) { list[i] = ((SuggestWord) sugQueue.pop()).string; } return list; } /** * Add a clause to a boolean query. */ private static void add(BooleanQuery q, String name, String value, float boost) { Query tq = new TermQuery(new Term(name, value)); tq.setBoost(boost); q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD)); } /** * Add a clause to a boolean query. */ private static void add(BooleanQuery q, String name, String value) { q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD)); } /** * Form all ngrams for a given word. * @param text the word to parse * @param ng the ngram length e.g. 3 * @return an array of all ngrams in the word and note that duplicates are not removed */ private static String[] formGrams(String text, int ng) { int len = text.length(); String[] res = new String[len - ng + 1]; for (int i = 0; i < len - ng + 1; i++) { res[i] = text.substring(i, i + ng); } return res; } public void clearIndex() throws IOException { IndexReader.unlock(spellIndex); IndexWriter writer = new IndexWriter(spellIndex, null, true); writer.close(); } /** * Check whether the word exists in the index. * @param word String * @throws IOException * @return true iff the word exists in the index */ public boolean exist(String word) throws IOException { if (reader == null) { reader = IndexReader.open(spellIndex); } return reader.docFreq(new Term(F_WORD, word)) > 0; } /** * Index a Dictionary * @param dict the dictionary to index * @throws IOException */ public void indexDictionary(Dictionary dict) throws IOException { IndexReader.unlock(spellIndex); IndexWriter writer = new IndexWriter(spellIndex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellIndex)); writer.setMergeFactor(300); writer.setMaxBufferedDocs(150); Iterator iter = dict.getWordsIterator(); while (iter.hasNext()) { String word = (String) iter.next(); int len = word.length(); if (len < 3) { continue; // too short we bail but "too long" is fine... } if (this.exist(word)) { // if the word already exist in the gramindex continue; } // ok index the word Document doc = createDocument(word, getMin(len), getMax(len)); writer.addDocument(doc); } // close writer writer.optimize(); writer.close(); } private int getMin(int l) { if (l > 5) { return 3; } if (l == 5) { return 2; } return 1; } private int getMax(int l) { if (l > 5) { return 4; } if (l == 5) { return 3; } return 2; } private static Document createDocument(String text, int ng1, int ng2) { Document doc = new Document(); doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term addGram(text, doc, ng1, ng2); return doc; } private static void addGram(String text, Document doc, int ng1, int ng2) { int len = text.length(); for (int ng = ng1; ng <= ng2; ng++) { String key = "gram" + ng; String end = null; for (int i = 0; i < len - ng + 1; i++) { String gram = text.substring(i, i + ng); doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED)); if (i == 0) { doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED)); } end = gram; } if (end != null) { // may not be present if len==ng1 doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED)); } } } protected void finalize() throws Throwable { try { if (reader != null) { reader.close(); } } finally { super.finalize(); } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -