📄 WordlistLoader.java
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;

/**
 * Loader for text files that represent a list of stopwords.
 *
 * @version $Id: WordlistLoader.java 564236 2007-08-09 15:21:19Z gsingers $
 */
public class WordlistLoader {

  /**
   * Loads a text file and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the file should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param wordfile File containing the wordlist
   * @return A HashSet with the file's words
   */
  public static HashSet getWordSet(File wordfile) throws IOException {
    HashSet result = new HashSet();
    FileReader reader = null;
    try {
      reader = new FileReader(wordfile);
      result = getWordSet(reader);
    }
    finally {
      if (reader != null)
        reader.close();
    }
    return result;
  }

  /**
   * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the Reader should contain only
   * one word. The words need to be in lowercase if you make use of an
   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
   *
   * @param reader Reader containing the wordlist
   * @return A HashSet with the reader's words
   */
  public static HashSet getWordSet(Reader reader) throws IOException {
    HashSet result = new HashSet();
    BufferedReader br = null;
    try {
      if (reader instanceof BufferedReader) {
        br = (BufferedReader) reader;
      } else {
        br = new BufferedReader(reader);
      }
      String word = null;
      while ((word = br.readLine()) != null) {
        result.add(word.trim());
      }
    }
    finally {
      if (br != null)
        br.close();
    }
    return result;
  }

  /**
   * Reads a stem dictionary. Each line contains:
   * <pre>word<b>\t</b>stem</pre>
   * (i.e. two tab-separated words)
   *
   * @return stem dictionary that overrules the stemming algorithm
   * @throws IOException
   */
  public static HashMap getStemDict(File wordstemfile) throws IOException {
    if (wordstemfile == null)
      throw new NullPointerException("wordstemfile may not be null");
    HashMap result = new HashMap();
    BufferedReader br = null;
    FileReader fr = null;
    try {
      fr = new FileReader(wordstemfile);
      br = new BufferedReader(fr);
      String line;
      while ((line = br.readLine()) != null) {
        String[] wordstem = line.split("\t", 2);
        result.put(wordstem[0], wordstem[1]);
      }
    }
    finally {
      // Close the wrapping reader first; it closes the underlying FileReader,
      // so the second close is a no-op safety net.
      if (br != null)
        br.close();
      if (fr != null)
        fr.close();
    }
    return result;
  }

}
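Below is a minimal usage sketch showing how the two loaders above might be called. The class name WordlistLoaderDemo and the file names stopwords.txt and stems.txt are placeholders, not part of the Lucene source; getWordSet(File) and getStemDict(File) are the static methods defined in the class above, which in this Lucene version return raw (non-generic) HashSet and HashMap instances.

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.lucene.analysis.WordlistLoader;

public class WordlistLoaderDemo {
  public static void main(String[] args) throws IOException {
    // stopwords.txt: one word per line; words should already be lowercased
    // if the consuming Analyzer applies LowerCaseFilter (e.g. StandardAnalyzer).
    HashSet stopwords = WordlistLoader.getWordSet(new File("stopwords.txt"));
    System.out.println("Loaded " + stopwords.size() + " stopwords");

    // stems.txt: each line is word<TAB>stem; the resulting map can be used
    // to override a stemming algorithm for the listed words.
    HashMap stemOverrides = WordlistLoader.getStemDict(new File("stems.txt"));
    System.out.println("Loaded " + stemOverrides.size() + " stem overrides");
  }
}

Both loaders read the entire file eagerly and close their readers in finally blocks, so the caller only needs to handle IOException.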