
📄 VectorSpaceMetric.java

📁 wekaUT is a weka-based package of semi-supervised learning classifiers developed at the University of Texas at Austin
💻 JAVA
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    VectorSpaceMetric.java
 *    Copyright (C) 2001 Mikhail Bilenko, Raymond J. Mooney
 */

package weka.deduping.metrics;

import java.util.*;
import java.io.Serializable;

import weka.core.*;

/**
 * This class uses a vector space to calculate similarity between two strings.
 * Some code borrowed from the ir.vsr package by Raymond J. Mooney.
 *
 * @author Mikhail Bilenko
 */
public class VectorSpaceMetric extends StringMetric
  implements DataDependentStringMetric, OptionHandler, Serializable {

  /** Strings are mapped to StringReferences in this hash */
  protected HashMap m_stringRefHash = null;

  /** A HashMap where tokens are indexed. Each indexed token maps
   *  to a TokenInfo. */
  protected HashMap m_tokenHash = null;

  /** A list of all indexed strings.  Elements are StringReference's. */
  public ArrayList m_stringRefs = null;

  /** An underlying tokenizer that is used for converting strings
   *  into HashMapVectors */
  protected Tokenizer m_tokenizer = new WordTokenizer();

  /** Should IDF weighting be used? */
  protected boolean m_useIDF = true;

  /** We can have different ways of converting from similarity to distance */
  public static final int CONVERSION_LAPLACIAN = 1;
  public static final int CONVERSION_UNIT = 2;
  public static final int CONVERSION_EXPONENTIAL = 4;
  public static final Tag[] TAGS_CONVERSION = {
    new Tag(CONVERSION_UNIT, "distance = 1-similarity"),
    new Tag(CONVERSION_LAPLACIAN, "distance=1/(1+similarity)"),
    new Tag(CONVERSION_EXPONENTIAL, "distance=exp(-similarity)")
  };

  /** The method of converting, by default exponential */
  protected int m_conversionType = CONVERSION_EXPONENTIAL;

  /** Construct a new, empty vector space metric; the inverted index is
   *  built later via buildMetric(List). */
  public VectorSpaceMetric() {
    m_stringRefHash = new HashMap();
    m_tokenHash = new HashMap();
    m_stringRefs = new ArrayList();
  }

  /** Given a list of strings, build the vector space
   * @param strings a list of strings from which the inverted index is
   * to be constructed
   */
  public void buildMetric(List strings) throws Exception {
    m_stringRefHash = new HashMap();
    m_tokenHash = new HashMap();

    // Loop, processing each of the examples
    Iterator stringIterator = strings.iterator();
    while (stringIterator.hasNext()) {
      String string = (String) stringIterator.next();
      // Create a document vector for this document
      HashMapVector vector = m_tokenizer.tokenize(string);
      vector.initLength();
      indexString(string, vector);
    }
    // Now that all strings have been processed, we can calculate the IDF weights for
    // all tokens and the resulting lengths of all weighted document vectors.
    computeIDFandStringLengths();
    System.out.println("Indexed " + m_stringRefs.size() + " documents with "
                       + size() + " unique terms.");
  }

  /** Index a given string using its corresponding vector */
  protected void indexString(String string, HashMapVector vector) {
    // Create a new reference
    StringReference strRef = new StringReference(string, vector);
    m_stringRefs.add(strRef);
    m_stringRefHash.put(string, strRef);

    // Iterate through each of the tokens in the document
    Iterator mapEntries = vector.iterator();
    while (mapEntries.hasNext()) {
      Map.Entry entry = (Map.Entry) mapEntries.next();
      // An entry in the HashMap maps a token to a Weight
      String token = (String) entry.getKey();
      // The count for the token is in the value of the Weight
      int count = (int) ((Weight) entry.getValue()).getValue();
      // Add an occurrence of this token to the inverted index pointing to this document
      indexToken(token, count, strRef);
    }
  }

  /** Add a token occurrence to the index.
   * @param token The token to index.
   * @param count The number of times it occurs in the document.
   * @param strRef A reference to the String it occurs in.
   */
  protected void indexToken(String token, int count, StringReference strRef) {
    // Find this token in the index
    TokenInfo tokenInfo = (TokenInfo) m_tokenHash.get(token);
    if (tokenInfo == null) {
      // If this is a new token, create info for it to put in the hashtable
      tokenInfo = new TokenInfo();
      m_tokenHash.put(token, tokenInfo);
    }
    // Add a new occurrence for this token to its info
    tokenInfo.occList.add(new TokenOccurrence(strRef, count));
  }

  /** Compute the IDF factor for every token in the index and the length
   *  of the string vector for every string referenced in the index. */
  protected void computeIDFandStringLengths() {
    // Let N be the total number of documents indexed
    double N = m_stringRefs.size();

    // Iterate through each of the tokens in the index
    Iterator mapEntries = m_tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry) mapEntries.next();
      String token = (String) entry.getKey();
      TokenInfo tokenInfo = (TokenInfo) entry.getValue();
      // Get the total number of strings in which this token occurs
      double numStringRefs = tokenInfo.occList.size();
      // Calculate the IDF factor for this token
      double idf = Math.log(N / numStringRefs);
      if (idf == 0.0) {
        // If IDF is 0, then just remove this inconsequential token from the index
        mapEntries.remove();
      } else {
        tokenInfo.idf = idf;
        // In order to compute document vector lengths, sum the
        // square of the weights (IDF * occurrence count) across
        // every token occurrence for each document.
        for (int i = 0; i < tokenInfo.occList.size(); i++) {
          TokenOccurrence occ = (TokenOccurrence) tokenInfo.occList.get(i);
          if (m_useIDF) {
            occ.m_stringRef.m_length = occ.m_stringRef.m_length + Math.pow(idf * occ.m_count, 2);
          } else {
            occ.m_stringRef.m_length = occ.m_stringRef.m_length + occ.m_count * occ.m_count;
          }
        }
      }
    }
    // At this point, every document length should be the sum of the squares of
    // its token weights.  In order to calculate final lengths, just need to
    // set the length of every document reference to the square-root of this sum.
    for (int i = 0; i < m_stringRefs.size(); i++) {
      StringReference stringRef = (StringReference) m_stringRefs.get(i);
      stringRef.m_length = Math.sqrt(stringRef.m_length);
    }
  }

  /** Compute similarity between two strings
   * @param s1 first string
   * @param s2 second string
   * @return similarity between two strings
   */
  public double similarity(String s1, String s2) {
    StringReference stringRef1 = (StringReference) m_stringRefHash.get(s1);
    StringReference stringRef2 = (StringReference) m_stringRefHash.get(s2);
    double length1 = stringRef1.m_length;
    double length2 = stringRef2.m_length;
    HashMapVector v1 = stringRef1.m_vector;
    HashMapVector v2 = stringRef2.m_vector;
    double similarity = 0;

    if (length1 == 0 || length2 == 0) {
      return 0;
    }

    Iterator mapEntries = v1.iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the count for each token in the query
      Map.Entry entry = (Map.Entry) mapEntries.next();
      String token = (String) entry.getKey();
      if (v2.hashMap.containsKey(token)) {
        double count1 = ((Weight) entry.getValue()).getValue();
        double count2 = ((Weight) v2.hashMap.get(token)).getValue();
        TokenInfo tokenInfo = (TokenInfo) m_tokenHash.get(token);
        // add this component unless it was killed (with idf=0)
        if (tokenInfo != null) {
          double increment = count1 * count2;
          if (m_useIDF) {
            increment *= tokenInfo.idf * tokenInfo.idf;
          }
          similarity += increment;
        }
      }
    }
    similarity /= length1 * length2;
    return similarity;
  }

  /** The computation of a metric can be either based on distance, or on similarity
   * @return false because the dot product fundamentally computes similarity
   */
  public boolean isDistanceBased() {
    return false;
  }

  /** Set the tokenizer to use
   * @param tokenizer the tokenizer that is used
   */
  public void setTokenizer(Tokenizer tokenizer) {
    m_tokenizer = tokenizer;
  }

  /** Get the tokenizer to use
   * @return the tokenizer that is used
   */
  public Tokenizer getTokenizer() {
    return m_tokenizer;
  }

  /** Turn IDF weighting on/off
   * @param useIDF if true, all token weights will be weighted by IDF
   */
  public void setUseIDF(boolean useIDF) {
    m_useIDF = useIDF;
  }

  /** Check whether IDF weighting is on/off
   * @return if true, all token weights are weighted by IDF
   */
  public boolean getUseIDF() {
    return m_useIDF;
  }

  /** Return the number of tokens indexed.
   * @return the number of tokens indexed
   */
  public int size() {
    return m_tokenHash.size();
  }

  /**
   * Returns distance between two strings using the current conversion
   * type (CONVERSION_LAPLACIAN, CONVERSION_EXPONENTIAL, CONVERSION_UNIT, ...)
   * @param string1 First string.
   * @param string2 Second string.
   * @exception Exception if distance could not be estimated.
   */
  public double distance(String string1, String string2) throws Exception {
    switch (m_conversionType) {
    case CONVERSION_LAPLACIAN:
      return 1 / (1 + similarity(string1, string2));
    case CONVERSION_UNIT:
      return 2 * (1 - similarity(string1, string2));
    case CONVERSION_EXPONENTIAL:
      return Math.exp(-similarity(string1, string2));
    default:
      throw new Exception("Unknown similarity to distance conversion method");
    }
  }

  /**
   * Set the type of similarity to distance conversion. Values other
   * than CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL
   * will be ignored.
   *
   * @param conversionType type of the similarity to distance conversion to use
   */
  public void setConversionType(SelectedTag conversionType) {
    if (conversionType.getTags() == TAGS_CONVERSION) {
      m_conversionType = conversionType.getSelectedTag().getID();
    }
  }

  /**
   * Return the type of similarity to distance conversion
   * @return one of CONVERSION_LAPLACIAN, CONVERSION_UNIT, or CONVERSION_EXPONENTIAL
   */
  public SelectedTag getConversionType() {
    return new SelectedTag(m_conversionType, TAGS_CONVERSION);
  }

  /** Create a copy of this metric
   * @return another VectorSpaceMetric with the same exact parameters as this metric
   */
  public Object clone() {
    VectorSpaceMetric metric = new VectorSpaceMetric();
    metric.setConversionType(new SelectedTag(m_conversionType, TAGS_CONVERSION));
    metric.setTokenizer(m_tokenizer);
    metric.setUseIDF(m_useIDF);
    return metric;
  }

  /**
   * Gets the current settings of VectorSpaceMetric.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] options = new String[20];
    int current = 0;

    if (m_conversionType == CONVERSION_EXPONENTIAL) {
      options[current++] = "-E";
    } else if (m_conversionType == CONVERSION_UNIT) {
      options[current++] = "-U";
    }
    if (m_useIDF) {
      options[current++] = "-I";
    }
    options[current++] = "-T";
    options[current++] = Utils.removeSubstring(m_tokenizer.getClass().getName(),
                                               "weka.deduping.metrics.");
    if (m_tokenizer instanceof OptionHandler) {
      String[] tokenizerOptions = ((OptionHandler) m_tokenizer).getOptions();
      for (int i = 0; i < tokenizerOptions.length; i++) {
        options[current++] = tokenizerOptions[i];
      }
    }

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Parses a given list of options. Valid options are:<p>
   *
   * -E use exponential conversion from similarity to distance <br>
   * -U use unit conversion from similarity to distance <br>
   * -I use IDF weighting <br>
   * -T tokenizer class name, followed by the tokenizer's own options
   */
  public void setOptions(String[] options) throws Exception {
    // TODO
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(0);
    return newVector.elements();
  }
}
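For readers who want to try the class, here is a minimal usage sketch. It is not part of the original wekaUT file: the wrapper class name and the sample strings are illustrative, and it relies only on the public methods visible in the listing above. Note that similarity() and distance() look strings up in m_stringRefHash, so both arguments must have been indexed by buildMetric() first.

import java.util.ArrayList;
import java.util.List;
import weka.core.SelectedTag;
import weka.deduping.metrics.VectorSpaceMetric;

public class VectorSpaceMetricExample {
  public static void main(String[] args) throws Exception {
    // Strings to index; these must include every string we later compare
    List strings = new ArrayList();
    strings.add("john a smith");
    strings.add("jon smith");
    strings.add("mary jones");

    VectorSpaceMetric metric = new VectorSpaceMetric();
    metric.setUseIDF(true);        // weight token counts by IDF (the default)
    metric.buildMetric(strings);   // tokenize all strings and build the inverted index

    // Dot product of the IDF-weighted, length-normalized vectors
    // (0 if the two strings share no indexed tokens)
    double sim = metric.similarity("john a smith", "jon smith");

    // Convert similarity to a distance; exp(-similarity) is the default conversion
    metric.setConversionType(new SelectedTag(
        VectorSpaceMetric.CONVERSION_EXPONENTIAL,
        VectorSpaceMetric.TAGS_CONVERSION));
    double dist = metric.distance("john a smith", "jon smith");

    System.out.println("similarity = " + sim + ", distance = " + dist);
  }
}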
