📄 synonymmap.java

📁 lucene2.2.0版本
💻 JAVA
字号:
package org.apache.lucene.index.memory;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import java.io.IOException;import java.io.InputStream;import java.nio.ByteBuffer;import java.nio.charset.Charset;import java.util.ArrayList;import java.util.Arrays;import java.util.HashMap;import java.util.Iterator;import java.util.Map;import java.util.TreeMap;import java.util.TreeSet;/** * Loads the <a target="_blank"  * href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a * href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a> * into a thread-safe main-memory hash map that can be used for fast * high-frequency lookups of synonyms for any given (lowercase) word string. * <p> * There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A). * There does not necessarily hold: A -> B, B -> C then A -> C. * <p> * Loading typically takes some 1.5 secs, so should be done only once per * (server) program execution, using a singleton pattern. Once loaded, a * synonym lookup via {@link #getSynonyms(String)}takes constant time O(1). * A loaded default synonym map consumes about 10 MB main memory. * An instance is immutable, hence thread-safe. * <p> * This implementation borrows some ideas from the Lucene Syns2Index demo that  * Dave Spencer originally contributed to Lucene. Dave's approach * involved a persistent Lucene index which is suitable for occasional * lookups or very large synonym tables, but considered unsuitable for  * high-frequency lookups of medium size synonym tables. * <p> * Example Usage: * <pre> * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"}; * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl")); * for (int i = 0; i &lt; words.length; i++) { *     String[] synonyms = map.getSynonyms(words[i]); *     System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString()); * } *  * Example output: * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough] * woods:[forest, wood] * forest:[afforest, timber, timberland, wood, woodland, woods] * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike] * xxxx:[] * </pre> *  * @author whoschek.AT.lbl.DOT.gov * @see <a target="_blank" *      href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb *      man page </a> * @see <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a> */public class SynonymMap {  /** the index data; Map<String word, String[] synonyms> */  private final HashMap table;    private static final String[] EMPTY = new String[0];    private static final boolean DEBUG = false;  /**   * Constructs an instance, loading WordNet synonym data from the given input   * stream. Finally closes the stream. The words in the stream must be in   * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).   *    * @param input   *            the stream to read from (null indicates an empty synonym map)   * @throws IOException   *             if an error occured while reading the stream.   */  public SynonymMap(InputStream input) throws IOException {    this.table = input == null ? new HashMap(0) : read(toByteArray(input));  }    /**   * Returns the synonym set for the given word, sorted ascending.   *    * @param word   *            the word to lookup (must be in lowercase).   * @return the synonyms; a set of zero or more words, sorted ascending, each   *         word containing lowercase characters that satisfy   *         <code>Character.isLetter()</code>.   */  public String[] getSynonyms(String word) {    Object syns = table.get(word);    if (syns == null) return EMPTY;    if (syns instanceof String) return new String[] {(String) syns};        String[] synonyms = (String[]) syns;    String[] copy = new String[synonyms.length]; // copy for guaranteed immutability    System.arraycopy(synonyms, 0, copy, 0, synonyms.length);    return copy;  }    /**   * Returns a String representation of the index data for debugging purposes.   *    * @return a String representation   */  public String toString() {    StringBuffer buf = new StringBuffer();    Iterator iter = new TreeMap(table).keySet().iterator();    int count = 0;    int f0 = 0;    int f1 = 0;    int f2 = 0;    int f3 = 0;        while (iter.hasNext()) {      String word = (String) iter.next();      buf.append(word + ":");      String[] synonyms = getSynonyms(word);      buf.append(Arrays.asList(synonyms));      buf.append("\n");      count += synonyms.length;      if (synonyms.length == 0) f0++;      if (synonyms.length == 1) f1++;      if (synonyms.length == 2) f2++;      if (synonyms.length == 3) f3++;    }        buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);    return buf.toString();  }    /**   * Analyzes/transforms the given word on input stream loading. This default implementation simply   * lowercases the word. Override this method with a custom stemming   * algorithm or similar, if desired.   *    * @param word   *            the word to analyze   * @return the same word, or a different word (or null to indicate that the   *         word should be ignored)   */  protected String analyze(String word) {    return word.toLowerCase();  }  private static boolean isValid(String str) {    for (int i=str.length(); --i >= 0; ) {      if (!Character.isLetter(str.charAt(i))) return false;    }    return true;  }  private HashMap read(byte[] data) {    int WORDS  = (int) (76401 / 0.7); // presizing    int GROUPS = (int) (88022 / 0.7); // presizing    HashMap word2Groups = new HashMap(WORDS);  // Map<String word, int[] groups>    HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>    HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>    Charset charset = Charset.forName("UTF-8");    int lastNum = -1;    Integer lastGroup = null;    int len = data.length;    int i=0;        while (i < len) { // until EOF      /* Part A: Parse a line */            // scan to beginning of group      while (i < len && data[i] != '(') i++;      if (i >= len) break; // EOF      i++;            // parse group      int num = 0;      while (i < len && data[i] != ',') {        num = 10*num + (data[i] - 48);        i++;      }      i++;//      if (DEBUG) System.err.println("num="+ num);            // scan to beginning of word      while (i < len && data[i] != '\'') i++;      i++;        // scan to end of word      int start = i;      do {        while (i < len && data[i] != '\'') i++;        i++;      } while (i < len && data[i] != ','); // word must end with "',"            if (i >= len) break; // EOF      String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();//      String word = new String(data, 0, start, i-start-1); // ASCII            /*       * Part B: ignore phrases (with spaces and hyphens) and       * non-alphabetic words, and let user customize word (e.g. do some       * stemming)       */      if (!isValid(word)) continue; // ignore      word = analyze(word);      if (word == null || word.length() == 0) continue; // ignore                  /* Part C: Add (group,word) to tables */            // ensure compact string representation, minimizing memory overhead      String w = (String) internedWords.get(word);      if (w == null) {        word = new String(word); // ensure compact string        internedWords.put(word, word);      } else {        word = w;      }            Integer group = lastGroup;      if (num != lastNum) {        group = new Integer(num);        lastGroup = group;        lastNum = num;      }            // add word --> group      ArrayList groups = (ArrayList) word2Groups.get(word);      if (groups == null) {        groups = new ArrayList(1);        word2Groups.put(word, groups);      }      groups.add(group);      // add group --> word      ArrayList words = (ArrayList) group2Words.get(group);      if (words == null) {        words = new ArrayList(1);        group2Words.put(group, words);      }       words.add(word);    }            /* Part D: compute index data structure */    HashMap word2Syns = createIndex(word2Groups, group2Words);                /* Part E: minimize memory consumption by a factor 3 (or so) *///    if (true) return word2Syns;    word2Groups = null; // help gc    group2Words = null; // help gc        return optimize(word2Syns, internedWords);  }    private HashMap createIndex(Map word2Groups, Map group2Words) {    HashMap word2Syns = new HashMap();    Iterator iter = word2Groups.entrySet().iterator();        while (iter.hasNext()) { // for each word      Map.Entry entry = (Map.Entry) iter.next();      ArrayList group = (ArrayList) entry.getValue();           String word = (String) entry.getKey();      //      HashSet synonyms = new HashSet();      TreeSet synonyms = new TreeSet();      for (int i=group.size(); --i >= 0; ) { // for each groupID of word        ArrayList words = (ArrayList) group2Words.get(group.get(i));        for (int j=words.size(); --j >= 0; ) { // add all words                 Object synonym = words.get(j); // note that w and word are interned          if (synonym != word) { // a word is implicitly it's own synonym            synonyms.add(synonym);          }        }      }      int size = synonyms.size();      if (size > 0) {        String[] syns = new String[size];        if (size == 1)            syns[0] = (String) synonyms.first();        else          synonyms.toArray(syns);//        if (syns.length > 1) Arrays.sort(syns);//        if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));        word2Syns.put(word, syns);      }    }      return word2Syns;  }  private HashMap optimize(HashMap word2Syns, HashMap internedWords) {    if (DEBUG) {      System.err.println("before gc");      for (int i=0; i < 10; i++) System.gc();      System.err.println("after gc");    }        // collect entries    int len = 0;    int size = word2Syns.size();    String[][] allSynonyms = new String[size][];    String[] words = new String[size];    Iterator iter = word2Syns.entrySet().iterator();    for (int j=0; j < size; j++) {      Map.Entry entry = (Map.Entry) iter.next();      allSynonyms[j] = (String[]) entry.getValue();       words[j] = (String) entry.getKey();      len += words[j].length();    }        // assemble large string containing all words    StringBuffer buf = new StringBuffer(len);    for (int j=0; j < size; j++) buf.append(words[j]);    String allWords = new String(buf.toString()); // ensure compact string across JDK versions    buf = null;        // intern words at app level via memory-overlaid substrings    for (int p=0, j=0; j < size; j++) {      String word = words[j];      internedWords.put(word, allWords.substring(p, p + word.length()));      p += word.length();    }        // replace words with interned words    for (int j=0; j < size; j++) {      String[] syns = allSynonyms[j];      for (int k=syns.length; --k >= 0; ) {        syns[k] = (String) internedWords.get(syns[k]);      }      Object replacement = syns;      if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more      word2Syns.remove(words[j]);      word2Syns.put(internedWords.get(words[j]), replacement);    }        if (DEBUG) {      words = null;      allSynonyms = null;      internedWords = null;      allWords = null;      System.err.println("before gc");      for (int i=0; i < 10; i++) System.gc();      System.err.println("after gc");    }    return word2Syns;  }    // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux  private static byte[] toByteArray(InputStream input) throws IOException {    try {      // safe and fast even if input.available() behaves weird or buggy      int len = Math.max(256, input.available());      byte[] buffer = new byte[len];      byte[] output = new byte[len];            len = 0;      int n;      while ((n = input.read(buffer)) >= 0) {        if (len + n > output.length) { // grow capacity          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];          System.arraycopy(output, 0, tmp, 0, len);          System.arraycopy(buffer, 0, tmp, len, n);          buffer = output; // use larger buffer for future larger bulk reads          output = tmp;        } else {          System.arraycopy(buffer, 0, output, len, n);        }        len += n;      }      if (len == output.length) return output;      buffer = null; // help gc      buffer = new byte[len];      System.arraycopy(output, 0, buffer, 0, len);      return buffer;    } finally {      if (input != null) input.close();    }  }  }
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -