spellchecker.java

来自「一套java版本的搜索引擎源码」· Java 代码 · 共 354 行
JAVA
354 行
package org.apache.lucene.search.spell;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import java.io.IOException;import org.apache.lucene.analysis.WhitespaceAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.Term;import org.apache.lucene.search.BooleanClause;import org.apache.lucene.search.BooleanQuery;import org.apache.lucene.search.Hits;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.TermQuery;import org.apache.lucene.store.Directory;import java.util.*;/** * <p> *   Spell Checker class  (Main class) <br/> *  (initially inspired by the David Spencer code). * </p> * * <p>Example Usage: *  * <pre> *  SpellChecker spellchecker = new SpellChecker(spellIndexDirectory); *  // To index a field of a user index: *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field)); *  // To index a file containing words: *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt"))); *  String[] suggestions = spellchecker.suggestSimilar("misspelt", 5); * </pre> *  * @author Nicolas Maisonneuve * @version 1.0 */public class SpellChecker {  /**   * Field name for each word in the ngram index.   */  public static final String F_WORD = "word";  /**   * the spell index   */  Directory spellIndex;  /**   * Boost value for start and end grams   */  private float bStart = 2.0f;  private float bEnd = 1.0f;  private IndexReader reader;  private IndexSearcher searcher;  // minimum score for hits generated by the spell checker query  private float minScore = 0.5f;  public SpellChecker(Directory spellIndex) throws IOException {    this.setSpellIndex(spellIndex);  }  public void setSpellIndex(Directory spellIndex) throws IOException {    this.spellIndex = spellIndex;    if (!IndexReader.indexExists(spellIndex)) {        IndexWriter writer = new IndexWriter(spellIndex, null, true);        writer.close();    }    searcher = new IndexSearcher(this.spellIndex);  }  /**   * Sets the accuracy 0 &lt; minScore &lt; 1; default 0.5   */  public void setAccuracy(float min) {    this.minScore = min;  }  /**   * Suggest similar words   * @param word String the word you want a spell check done on   * @param numSug int the number of suggest words   * @throws IOException   * @return String[]   */  public String[] suggestSimilar(String word, int numSug) throws IOException {    return this.suggestSimilar(word, numSug, null, null, false);  }  /**   * Suggest similar words (restricted or not to a field of a user index)   * @param word String the word you want a spell check done on   * @param numSug int the number of suggest words   * @param ir the indexReader of the user index (can be null see field param)   * @param field String the field of the user index: if field is not null, the suggested   * words are restricted to the words present in this field.   * @param morePopular boolean return only the suggest words that are more frequent than the searched word   * (only if restricted mode = (indexReader!=null and field!=null)   * @throws IOException   * @return String[] the sorted list of the suggest words with this 2 criteria:   * first criteria: the edit distance, second criteria (only if restricted mode): the popularity   * of the suggest words in the field of the user index   */  public String[] suggestSimilar(String word, int numSug, IndexReader ir,      String field, boolean morePopular) throws IOException {    float min = this.minScore;    final TRStringDistance sd = new TRStringDistance(word);    final int lengthWord = word.length();    final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(field, word)) : 0;    // if the word exists in the real index and we don't care for word frequency, return the word itself    if (!morePopular && goalFreq > 0) {      return new String[] { word };    }    BooleanQuery query = new BooleanQuery();    String[] grams;    String key;    for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {      key = "gram" + ng; // form key      grams = formGrams(word, ng); // form word into ngrams (allow dups too)      if (grams.length == 0) {        continue; // hmm      }      if (bStart > 0) { // should we boost prefixes?        add(query, "start" + ng, grams[0], bStart); // matches start of word      }      if (bEnd > 0) { // should we boost suffixes        add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word      }      for (int i = 0; i < grams.length; i++) {        add(query, key, grams[i]);      }    }//    System.out.println("Q: " + query);    Hits hits = searcher.search(query);//    System.out.println("HITS: " + hits.length());    SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);    // go thru more than 'maxr' matches in case the distance filter triggers    int stop = Math.min(hits.length(), 10 * numSug);    SuggestWord sugWord = new SuggestWord();    for (int i = 0; i < stop; i++) {      sugWord.string = hits.doc(i).get(F_WORD); // get orig word      // don't suggest a word for itself, that would be silly      if (sugWord.string.equals(word)) {        continue;      }      // edit distance/normalize with the minScore word length      sugWord.score = 1.0f - ((float) sd.getDistance(sugWord.string) / Math          .min(sugWord.string.length(), lengthWord));      if (sugWord.score < min) {        continue;      }      if (ir != null) { // use the user index        sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index        // don't suggest a word that is not present in the field        if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {          continue;        }      }      sugQueue.insert(sugWord);      if (sugQueue.size() == numSug) {        // if queue full, maintain the minScore score        min = ((SuggestWord) sugQueue.top()).score;      }      sugWord = new SuggestWord();    }    // convert to array string    String[] list = new String[sugQueue.size()];    for (int i = sugQueue.size() - 1; i >= 0; i--) {      list[i] = ((SuggestWord) sugQueue.pop()).string;    }    return list;  }  /**   * Add a clause to a boolean query.   */  private static void add(BooleanQuery q, String name, String value, float boost) {    Query tq = new TermQuery(new Term(name, value));    tq.setBoost(boost);    q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));  }  /**   * Add a clause to a boolean query.   */  private static void add(BooleanQuery q, String name, String value) {    q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));  }  /**   * Form all ngrams for a given word.   * @param text the word to parse   * @param ng the ngram length e.g. 3   * @return an array of all ngrams in the word and note that duplicates are not removed   */  private static String[] formGrams(String text, int ng) {    int len = text.length();    String[] res = new String[len - ng + 1];    for (int i = 0; i < len - ng + 1; i++) {      res[i] = text.substring(i, i + ng);    }    return res;  }  public void clearIndex() throws IOException {    IndexReader.unlock(spellIndex);    IndexWriter writer = new IndexWriter(spellIndex, null, true);    writer.close();  }  /**   * Check whether the word exists in the index.   * @param word String   * @throws IOException   * @return true iff the word exists in the index   */  public boolean exist(String word) throws IOException {    if (reader == null) {      reader = IndexReader.open(spellIndex);    }    return reader.docFreq(new Term(F_WORD, word)) > 0;  }  /**   * Index a Dictionary   * @param dict the dictionary to index   * @throws IOException   */  public void indexDictionary(Dictionary dict) throws IOException {    IndexReader.unlock(spellIndex);    IndexWriter writer = new IndexWriter(spellIndex, new WhitespaceAnalyzer(),        !IndexReader.indexExists(spellIndex));    writer.setMergeFactor(300);    writer.setMaxBufferedDocs(150);    Iterator iter = dict.getWordsIterator();    while (iter.hasNext()) {      String word = (String) iter.next();      int len = word.length();      if (len < 3) {        continue; // too short we bail but "too long" is fine...      }      if (this.exist(word)) { // if the word already exist in the gramindex        continue;      }      // ok index the word      Document doc = createDocument(word, getMin(len), getMax(len));      writer.addDocument(doc);    }    // close writer    writer.optimize();    writer.close();  }  private int getMin(int l) {    if (l > 5) {      return 3;    }    if (l == 5) {      return 2;    }    return 1;  }  private int getMax(int l) {    if (l > 5) {      return 4;    }    if (l == 5) {      return 3;    }    return 2;  }  private static Document createDocument(String text, int ng1, int ng2) {    Document doc = new Document();    doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term    addGram(text, doc, ng1, ng2);    return doc;  }  private static void addGram(String text, Document doc, int ng1, int ng2) {    int len = text.length();    for (int ng = ng1; ng <= ng2; ng++) {      String key = "gram" + ng;      String end = null;      for (int i = 0; i < len - ng + 1; i++) {        String gram = text.substring(i, i + ng);        doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));        if (i == 0) {          doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));        }        end = gram;      }      if (end != null) { // may not be present if len==ng1        doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED));      }    }  }  protected void finalize() throws Throwable {    try {      if (reader != null) {        reader.close();      }    } finally {      super.finalize();    }  }}
spellchecker.java - 源码说明

本页面展示了「一套java版本的搜索引擎源码」中的 spellchecker.java 源码文件，采用 Java 编程语言编写，共 354 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与java相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?