/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    JaccardMetric.java
 *    Copyright (C) 2001 Mikhail Bilenko, Raymond J. Mooney
 */

package weka.deduping.metrics;

import java.util.*;
import java.io.Serializable;
import weka.core.*;

/**
 * Calculates similarity between two strings using a Jaccard-style
 * token-overlap metric.  Strings must first be indexed with
 * {@link #buildMetric(List)}; {@link #similarity(String, String)} then
 * operates on the cached token vectors.
 * Some code borrowed from the ir.vsr package by Raymond J. Mooney.
 *
 * @author Mikhail Bilenko
 */
public class JaccardMetric extends StringMetric
  implements DataDependentStringMetric, OptionHandler, Serializable {

  /** Strings are mapped to StringReferences in this hash */
  protected HashMap m_stringRefHash = null;

  /** A HashMap where tokens are indexed.  Each indexed token maps
   *  to a TokenInfo. */
  protected HashMap m_tokenHash = null;

  /** A list of all indexed strings.  Elements are StringReference's. */
  public ArrayList m_stringRefs = null;

  /** An underlying tokenizer that is used for converting strings
   *  into HashMapVectors */
  protected Tokenizer m_tokenizer = new WordTokenizer();

  /** We can have different ways of converting from similarity to distance */
  public static final int CONVERSION_LAPLACIAN = 1;
  public static final int CONVERSION_UNIT = 2;
  public static final int CONVERSION_EXPONENTIAL = 4;
  public static final Tag[] TAGS_CONVERSION = {
    new Tag(CONVERSION_UNIT, "distance = 1-similarity"),
    new Tag(CONVERSION_LAPLACIAN, "distance=1/(1+similarity)"),
    new Tag(CONVERSION_EXPONENTIAL, "distance=exp(-similarity)")
  };

  /** The method of converting, by default laplacian */
  protected int m_conversionType = CONVERSION_LAPLACIAN;

  /**
   * Creates an empty metric; call {@link #buildMetric(List)} before
   * computing similarities.
   */
  public JaccardMetric() {
    m_stringRefHash = new HashMap();
    m_tokenHash = new HashMap();
    m_stringRefs = new ArrayList();
  }

  /**
   * Given a list of strings, build the vector space: tokenize each string
   * and add it to the inverted index.
   *
   * @param strings a list of String objects from which the inverted index
   * is to be constructed
   * @exception Exception if tokenization fails
   */
  public void buildMetric(List strings) throws Exception {
    m_stringRefHash = new HashMap();
    m_tokenHash = new HashMap();
    // BUG FIX: m_stringRefs was previously not cleared here, so calling
    // buildMetric() more than once accumulated stale StringReferences and
    // inflated the reported document count.
    m_stringRefs = new ArrayList();

    // Loop, processing each of the examples
    Iterator stringIterator = strings.iterator();
    while (stringIterator.hasNext()) {
      String string = (String) stringIterator.next();
      // Create a document vector for this string
      HashMapVector vector = m_tokenizer.tokenize(string);
      vector.initLength();
      indexString(string, vector);
    }
    System.out.println("Indexed " + m_stringRefs.size()
                       + " documents with " + size() + " unique terms.");
  }

  /**
   * Index a given string using its corresponding token vector.
   *
   * @param string the string being indexed
   * @param vector the token-count vector produced by the tokenizer
   */
  protected void indexString(String string, HashMapVector vector) {
    // Create a new reference and register it under the raw string
    StringReference strRef = new StringReference(string, vector);
    m_stringRefs.add(strRef);
    m_stringRefHash.put(string, strRef);

    // Iterate through each of the tokens in the document
    Iterator mapEntries = vector.iterator();
    while (mapEntries.hasNext()) {
      Map.Entry entry = (Map.Entry) mapEntries.next();
      // An entry in the HashMap maps a token to a Weight
      String token = (String) entry.getKey();
      // The count for the token is in the value of the Weight
      int count = (int) ((Weight) entry.getValue()).getValue();
      // Add an occurrence of this token to the inverted index
      // pointing to this document
      indexToken(token, count, strRef);
    }
  }

  /**
   * Add a token occurrence to the inverted index.
   *
   * @param token the token to index
   * @param count the number of times it occurs in the document
   * @param strRef a reference to the String it occurs in
   */
  protected void indexToken(String token, int count, StringReference strRef) {
    // Find this token in the index
    TokenInfo tokenInfo = (TokenInfo) m_tokenHash.get(token);
    if (tokenInfo == null) {
      // If this is a new token, create info for it to put in the hashtable
      tokenInfo = new TokenInfo();
      m_tokenHash.put(token, tokenInfo);
    }
    // Add a new occurrence for this token to its info
    tokenInfo.occList.add(new TokenOccurrence(strRef, count));
  }

  /**
   * Compute similarity between two strings.  Both strings must have been
   * indexed via {@link #buildMetric(List)}; otherwise the hash lookups
   * return null and this method throws a NullPointerException.
   * <p>
   * NOTE(review): the value computed is |common tokens| / max(|v1|, |v2|),
   * i.e. overlap over the larger vocabulary — a Jaccard-like approximation
   * rather than true intersection-over-union.
   *
   * @param s1 first string
   * @param s2 second string
   * @return similarity between the two strings, in [0, 1]
   */
  public double similarity(String s1, String s2) {
    StringReference stringRef1 = (StringReference) m_stringRefHash.get(s1);
    StringReference stringRef2 = (StringReference) m_stringRefHash.get(s2);
    HashMapVector v1 = stringRef1.m_vector;
    HashMapVector v2 = stringRef2.m_vector;

    // Count token types of v1 that also occur in v2
    int common = 0;
    Iterator mapEntries = v1.iterator();
    while (mapEntries.hasNext()) {
      Map.Entry entry = (Map.Entry) mapEntries.next();
      String token = (String) entry.getKey();
      if (v2.hashMap.containsKey(token)) {
        common++;
      }
    }

    int l1 = v1.size();
    int l2 = v2.size();
    double jaccard = 0;
    // Guard against both vectors being empty (0/0)
    if (l1 + l2 != 0) {
      jaccard = (common + 0.0) / (0.0 + Math.max(l1, l2));
    }
    return jaccard;
  }

  /**
   * The computation of a metric can be either based on distance, or on
   * similarity.
   *
   * @return false because Jaccard fundamentally computes similarity
   */
  public boolean isDistanceBased() {
    return false;
  }

  /**
   * Set the tokenizer to use.
   *
   * @param tokenizer the tokenizer that is used
   */
  public void setTokenizer(Tokenizer tokenizer) {
    m_tokenizer = tokenizer;
  }

  /**
   * Get the tokenizer in use.
   *
   * @return the tokenizer that is used
   */
  public Tokenizer getTokenizer() {
    return m_tokenizer;
  }

  /**
   * Return the number of tokens indexed.
   *
   * @return the number of tokens indexed
   */
  public int size() {
    return m_tokenHash.size();
  }

  /**
   * Legacy index-free variant: computes a true Jaccard <em>distance</em>
   * (unique / (unique + common)) by tokenizing both strings on the fly
   * with a fixed delimiter set.  Kept for reference; the main code path
   * uses {@link #distance(String, String)} instead.
   *
   * @param s1 first string
   * @param s2 second string
   * @return Jaccard distance between the two token sets
   * @exception Exception declared for interface compatibility
   */
  public double distance1(String s1, String s2) throws Exception {
    TreeMap hash = new TreeMap();
    StringTokenizer parser = new StringTokenizer(s1, " ,()[]/.;:");
    while (parser.hasMoreTokens()) {
      String token = parser.nextToken();
      hash.put(token, token);
    }
    parser = new StringTokenizer(s2, " ,()[]/.;:");
    int common = 0, unique = 0;
    while (parser.hasMoreTokens()) {
      String token = parser.nextToken();
      if (hash.containsKey(token)) {
        common++;
        hash.remove(token);
      } else {
        unique++;
      }
    }
    // Tokens of s1 never matched by s2 are also unique
    unique += hash.size();
    return (double) unique / ((double) unique + common);
  }

  /**
   * Returns distance between two strings using the current conversion
   * type (CONVERSION_LAPLACIAN, CONVERSION_EXPONENTIAL, CONVERSION_UNIT).
   *
   * @param string1 first string
   * @param string2 second string
   * @return the converted distance
   * @exception Exception if the conversion type is unknown
   */
  public double distance(String string1, String string2) throws Exception {
    switch (m_conversionType) {
    case CONVERSION_LAPLACIAN:
      return 1 / (1 + similarity(string1, string2));
    case CONVERSION_UNIT:
      // NOTE(review): the TAGS_CONVERSION label says "distance = 1-similarity",
      // but the factor 2 scales this to [0, 2] — confirm whether the scaling
      // is intentional before relying on the documented formula.
      return 2 * (1 - similarity(string1, string2));
    case CONVERSION_EXPONENTIAL:
      return Math.exp(-similarity(string1, string2));
    default:
      throw new Exception("Unknown similarity to distance conversion method");
    }
  }

  /**
   * Set the type of similarity to distance conversion.  Values other
   * than CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL
   * will be ignored.
   *
   * @param conversionType type of the similarity to distance conversion to use
   */
  public void setConversionType(SelectedTag conversionType) {
    // Reference comparison against TAGS_CONVERSION is the standard weka
    // SelectedTag validity check
    if (conversionType.getTags() == TAGS_CONVERSION) {
      m_conversionType = conversionType.getSelectedTag().getID();
    }
  }

  /**
   * Return the type of similarity to distance conversion.
   *
   * @return one of CONVERSION_LAPLACIAN, CONVERSION_UNIT, or
   * CONVERSION_EXPONENTIAL wrapped in a SelectedTag
   */
  public SelectedTag getConversionType() {
    return new SelectedTag(m_conversionType, TAGS_CONVERSION);
  }

  /**
   * Create a copy of this metric.  Note that only the configuration
   * (conversion type and tokenizer) is copied — the built inverted index
   * is NOT duplicated, so the clone must be rebuilt with buildMetric()
   * before use.
   *
   * @return another JaccardMetric with the same parameters as this metric
   */
  public Object clone() {
    JaccardMetric metric = new JaccardMetric();
    metric.setConversionType(new SelectedTag(m_conversionType, TAGS_CONVERSION));
    metric.setTokenizer(m_tokenizer);
    return metric;
  }

  /**
   * Gets the current settings of JaccardMetric.
   *
   * @return an array of strings suitable for passing to setOptions(),
   * padded with empty strings per weka convention
   */
  public String[] getOptions() {
    String[] options = new String[20];
    int current = 0;

    if (m_conversionType == CONVERSION_EXPONENTIAL) {
      options[current++] = "-E";
    } else if (m_conversionType == CONVERSION_UNIT) {
      options[current++] = "-U";
    }

    options[current++] = "-T";
    options[current++] = Utils.removeSubstring(m_tokenizer.getClass().getName(),
                                               "weka.deduping.metrics.");
    if (m_tokenizer instanceof OptionHandler) {
      String[] tokenizerOptions = ((OptionHandler) m_tokenizer).getOptions();
      for (int i = 0; i < tokenizerOptions.length; i++) {
        options[current++] = tokenizerOptions[i];
      }
    }

    // Pad the remainder with empty strings (weka convention)
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Parses a given list of options.  Currently unimplemented.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    // TODO: parse -E / -U / -T per getOptions()
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options (currently empty)
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(0);
    return newVector.elements();
  }
}
