📄 taglink.java
字号:
/**
* <p>Title: </p> TagLink string distance
*
* <p>Description: </p> This is a Hybrid string metric. Token scores are
* computed by our character based method (TagLinkToken).
* Matched token pairs are defined by Algorithm1.
* This hybrid string distance follows notation as described in Camacho & Salhi 2006.
*
* <p>Copyright: Copyright (c) 2005</p>
*
* @author Horacio Camacho
* @author Sam Chapman <a href="http://www.dcs.shef.ac.uk/~sam/">Website</a>, <a href="mailto:sam@dcs.shef.ac.uk">Email</a>. (modified the code to optimise it, update it to generics and Java 1.5, and fit it into SimMetrics)
*
* email: jhcama@essex.ac.uk
* www: http://privatewww.essex.ac.uk/~jhcama/
*
* address: Horacio Camacho,
* Department of Mathematical Sciences,
* University of Essex,
* Colchester,
* Wivenhoe Park,
* CO4 3SQ
* United Kingdom,
*
* @version 1.1
*/
package uk.ac.shef.wit.simmetrics.similaritymetrics;
import uk.ac.shef.wit.simmetrics.tokenisers.InterfaceTokeniser;
import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserWhitespace;
import java.io.Serializable;
import java.util.*;
public final class TagLink extends AbstractStringMetric implements Serializable {
/**
* private idfMap contains the IDF weights for each token in the dataset.
*/
private HashMap<String,Float> idfMap;
/**
* Metric used to measure the similarity between individual tokens (one token taken
* from each input string). Every constructor visible in this file assigns it.
*/
private AbstractStringMetric characterBasedStringMetric;
/**
* Token-level metric used when a constructor is not given one. Being static, this single
* TagLinkToken instance is shared by all TagLink objects that rely on the default.
*/
private static final AbstractStringMetric DEFAULT_METRIC = new
TagLinkToken();
/**
* Tokeniser used to split the query strings into tokens. The constructors visible in
* this file always install a TokeniserWhitespace.
*/
private final InterfaceTokeniser tokeniser;
/**
* Scaling constant for the estimated timing cost: getSimilarityTimingEstimated multiplies
* it by the product of the two string lengths.
*/
private final float ESTIMATEDTIMINGCONST = 0.0006186370597243490f;
/**
* TagLink default constructor. IDF weights are all equally weighted.
* Transposition constant value is 0.3
*/
public TagLink() {
// Delegates to the single-metric constructor with the shared DEFAULT_METRIC.
// That constructor does not assign idfMap, so no IDF map is built on this path.
this(DEFAULT_METRIC);
}
/**
* TagLink constructor requires a character based string metric.
*
* @param characterBasedStringMetric CharacterBasedStringMetric
*/
public TagLink(AbstractStringMetric characterBasedStringMetric) {
    // Query strings are tokenised with a TokeniserWhitespace instance.
    this.tokeniser = new TokeniserWhitespace();
    // Token pairs are scored with the caller-supplied metric; idfMap is not assigned here.
    this.characterBasedStringMetric = characterBasedStringMetric;
}
/**
* TagLink constructor requires dataset data in order to compute the IDF
* weights. Default character based string metric is TagLinkToken.
*
* @param dataSetArray String[]
*/
public TagLink(String[] dataSetArray) {
// Delegates to the two-argument constructor with the shared DEFAULT_METRIC;
// that constructor fills idfMap from dataSetArray via getIDFMap.
this(dataSetArray, DEFAULT_METRIC);
}
/**
* TagLink constructor requires dataset data in order to compute the IDF
* weights. Also requires a character based string metric.
*
* @param dataSetArray String[]
* @param characterBasedStringMetric CharacterBasedStringMetric
*/
public TagLink(String[] dataSetArray,
        AbstractStringMetric characterBasedStringMetric) {
    this.tokeniser = new TokeniserWhitespace();
    this.characterBasedStringMetric = characterBasedStringMetric;
    // Kept last: both fields above are set before getIDFMap runs.
    // NOTE(review): getIDFMap presumably tokenises the data set with this.tokeniser - confirm.
    this.idfMap = getIDFMap(dataSetArray);
}
/**
* getMinStringSize counts the total number of characters in String array tTokens and
* in String array uTokens and returns the smaller of the two totals.
*
* @param tTokens String[]
* @param uTokens String[]
* @return float
*/
private float getMinStringSize(String[] tTokens, String[] uTokens) {
    // Total character count of the T tokens, accumulated as a float to match the return type.
    float charsInT = 0;
    for (int i = 0; i < tTokens.length; i++) {
        charsInT += tTokens[i].length();
    }
    // Same total for the U tokens.
    float charsInU = 0;
    for (int j = 0; j < uTokens.length; j++) {
        charsInU += uTokens[j].length();
    }
    // The shorter of the two strings, measured in token characters only.
    return Math.min(charsInT, charsInU);
}
/**
* gets the estimated time in milliseconds it takes to perform a similarity timing.
*
* @param string1 string 1
* @param string2 string 2
* @return the estimated time in milliseconds taken to perform the similarity measure
*/
public float getSimilarityTimingEstimated(final String string1, final String string2) {
    // Cost model: the estimate grows with the product of the two string lengths.
    // Each length is widened to float before multiplying, so the product is a float product.
    final float lengthProduct = (float) string1.length() * (float) string2.length();
    return lengthProduct * ESTIMATEDTIMINGCONST;
}
/**
* gets the un-normalised similarity measure of the metric for the given strings.
*
* @param T string T to get unnormalised test from
* @param U string U to get unnormalised test from
* @return returns the score of the similarity measure (un-normalised)
*/
public float getUnNormalisedSimilarity(String T, String U) {
    // No separate raw scale is computed: the un-normalised value is the getSimilarity result.
    final float similarity = getSimilarity(T, U);
    return similarity;
}
/**
* getSimilarity computes the similarity between a pair of strings T and U.
*
* @param T String
* @param U String
* @return float
*/
public float getSimilarity(String T, String U) {
    // Identical strings score 1.0 without any tokenisation.
    if (T.equals(U)) {
        return 1.0f;
    }
    // Tokenise both inputs (T first, then U) and copy the tokens into arrays.
    final Vector<String> tokensOfT = tokeniser.tokenize(T);
    final Vector<String> tokensOfU = tokeniser.tokenize(U);
    final String[] tTokens = tokensOfT.toArray(new String[tokensOfT.size()]);
    final String[] uTokens = tokensOfU.toArray(new String[tokensOfU.size()]);
    // One IDF weight per token, then algorithm1 selects the token pairs and sums their scores.
    final float[] tWeights = getIDFArray(tTokens);
    final float[] uWeights = getIDFArray(uTokens);
    return algorithm1(tTokens, uTokens, tWeights, uWeights);
}
/**
* algorithm1 selects the token pairs considered most appropriate and computes
* the sum of the scores of the selected pairs.
*
* @param tTokens String[]
* @param uTokens String[]
* @param tIdfArray float[]
* @param uIdfArray float[]
* @return float
*/
private float algorithm1(String[] tTokens, String[] uTokens,
        float[] tIdfArray, float[] uIdfArray) {
    // Build the scored (T position, U position) candidates and put them in the order
    // chosen by sortCandidateList; the loop below consumes them in that order.
    ArrayList<Candidates> candidateList =
            obtainCandidateList(tTokens, uTokens, tIdfArray, uIdfArray);
    sortCandidateList(candidateList);

    // Token positions already claimed by an accepted pair.
    Set<Integer> usedTPositions = new HashSet<Integer>();
    Set<Integer> usedUPositions = new HashSet<Integer>();

    float scoreValue = 0.0f;
    for (Candidates candidate : candidateList) {
        Integer tPos = candidate.getTPos();
        Integer uPos = candidate.getUPos();
        // Accept a pair only when neither of its tokens has been matched yet, so each
        // token contributes to at most one selected pair. Both checks are made before
        // either position is recorded.
        if (!usedTPositions.contains(tPos) && !usedUPositions.contains(uPos)) {
            scoreValue += candidate.getScore();
            usedTPositions.add(tPos);
            usedUPositions.add(uPos);
        }
    }
    return scoreValue;
}
/**
* getSimilarityExplained gives a brief explanation of how the string metric was
* computed.
*
* @param T String
* @param U String
* @return String
*/
public String getSimilarityExplained(String T, String U) {
StringBuffer buff = new StringBuffer();
buff.append("\n\t*****TagLink String Distance*****");
if (T.equals(U)) {
buff.append("\nS(T,U)=1.0\n");
} else {
Vector<String> tVector = tokeniser.tokenize(T);
Vector<String> uVector = tokeniser.tokenize(U);
String[] tTokens = tVector.toArray(new String[tVector.size()]),
uTokens = uVector.toArray(new String[uVector.size()]);
buff.append("\nT={");
for (String tToken : tTokens) {
buff.append(tToken).append(", ");
}
buff.append("}\n");
buff.append("U={");
for (String uToken : uTokens) {
buff.append(uToken).append(", ");
}
buff.append("}\n");
float minStringSize = getMinStringSize(tTokens, uTokens);
buff.append("min(|T|,|U|)=").append(minStringSize).append("\n");
buff.append("\nIDF weights:\n");
buff.append("Ti\tai(Ti)\n");
float[] tIdfArray = getIDFArray(tTokens),
uIdfArray = getIDFArray(uTokens);
for (int i = 0; i < tIdfArray.length; i++) {
buff.append(tTokens[i]).append("\t").append(round(tIdfArray[i])).append("\n");
}
buff.append("\nUj\taj(Uj)\n");
for (int i = 0; i < uIdfArray.length; i++) {
buff.append(uTokens[i]).append("\t").append(round(uIdfArray[i])).append("\n");
}
buff.append("\nScores:\n");
buff.append("Ti\tUj\tSij(Ti,Uj)\tIDFij(Ti,Uj)\tMRij(Ti,Uj)\tSij\n");
ArrayList<Candidates> candidateList = new ArrayList<Candidates>();
for (int t = 0; t < tTokens.length; t++) {
int lastTr = -1;
for (int u = 0, flag = 0; u < uTokens.length && flag == 0; u++) {
int tr = Math.abs(t - u);
if (lastTr >= 0 && lastTr < tr) {
flag = 1;
} else {
String tTok = tTokens[t], uTok = uTokens[u];
float innerScore = characterBasedStringMetric.getSimilarity(tTok,
uTok);
if (innerScore >= 0.0) {
float MR;
if (innerScore == 1.0) {
MR = tTokens[t].length();
} else {
MR = ((TagLinkToken) characterBasedStringMetric).getMatched();
}
MR = MR / minStringSize;
float IDF = tIdfArray[t] * uIdfArray[u],
weight = (IDF + MR) / 2.0f;
if (innerScore == 1) {
lastTr = tr;
}
buff.append(tTok).append("\t").append(uTok).append("\t").append(round(innerScore)).append("\t").append(round(IDF)).append("\t").append(round(MR)).append("\t").append(round(innerScore * weight)).append("\n");
candidateList.add(new Candidates(t, u, innerScore * weight));
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -