⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 taglink.java

📁 SimMetrics is a similarity metric library, e.g. edit distances (Levenshtein, Gotoh, Jaro, etc.)
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
            }
            sortCandidateList(candidateList);

            // iterate the candidate list
            buff.append("\nCommon tokens (Algorithm 1):\n");
            buff.append("Ti\tUj\tSij*Xij\n");
            float score = 0.0f;
            HashMap<Integer,Object> tMap = new HashMap<Integer, Object>();
            HashMap<Integer,Object> uMap = new HashMap<Integer, Object>();
            for (Object aCandidateList : candidateList) {
                Candidates actualCandidates = (Candidates) aCandidateList;
                Integer tPos = actualCandidates.getTPos();
                Integer uPos = actualCandidates.getUPos();
                if ((!tMap.containsKey(tPos)) &&
                        (!uMap.containsKey(uPos))) {
                    float tokenScore = actualCandidates.getScore();
                    score += tokenScore;
                    tMap.put(tPos, null);
                    uMap.put(uPos, null);
                    buff.append(tTokens[(tPos)]).append("\t").append(uTokens[(uPos)]).append("\t").append(round(tokenScore)).append("\n");
                }
            }
            buff.append("\nS(T,U)=").append(round(score)).append("\n");
        }
        return buff.toString();
    }

    /**
     * Builds the list of candidate token pairs between T and U. Every evaluated
     * pair is scored with the character-based metric and the score is scaled by
     * the mean of two weights: the matched length relative to the minimum string
     * size, and the product of the two tokens' IDF weights.
     * <p>
     * Not every pair is necessarily evaluated: for a given T token, scanning of
     * the U tokens stops as soon as the positional distance exceeds the distance
     * at which the most recent exact match (similarity 1.0) was found.
     *
     * @param tTokens   tokens of string T
     * @param uTokens   tokens of string U
     * @param tIdfArray weights of the T tokens, indexed like tTokens
     * @param uIdfArray weights of the U tokens, indexed like uTokens
     * @return the scored candidate pairs in evaluation order
     */
    private ArrayList<Candidates> obtainCandidateList(String[] tTokens, String[] uTokens,
                                          float[] tIdfArray, float[] uIdfArray) {
        ArrayList<Candidates> pairs = new ArrayList<Candidates>();
        float shortestSize = getMinStringSize(tTokens, uTokens);
        for (int tIndex = 0; tIndex < tTokens.length; tIndex++) {
            String tToken = tTokens[tIndex];
            // positional distance of the latest exact match for this T token (-1: none yet)
            int exactMatchDistance = -1;
            for (int uIndex = 0; uIndex < uTokens.length; uIndex++) {
                int distance = Math.abs(tIndex - uIndex);
                if (exactMatchDistance >= 0 && distance > exactMatchDistance) {
                    // farther away than the last exact match: give up on this T token
                    break;
                }
                float similarity = characterBasedStringMetric.getSimilarity(tToken,
                        uTokens[uIndex]);
                // written as a negated >= so that a NaN similarity is skipped as well
                if (!(similarity >= 0.0f)) {
                    continue;
                }
                float matchedLength;
                if (similarity == 1.0f) {
                    matchedLength = tToken.length();
                    exactMatchDistance = distance;
                } else {
                    // for partial matches the metric reports how much was matched
                    matchedLength = ((TagLinkToken) characterBasedStringMetric).getMatched();
                }
                float lengthWeight = matchedLength / shortestSize;
                float idfWeight = tIdfArray[tIndex] * uIdfArray[uIndex];
                float combinedWeight = (idfWeight + lengthWeight) / 2.0f;
                pairs.add(new Candidates(tIndex, uIndex, similarity * combinedWeight));
            }
        }
        return pairs;
    }

    /**
     * Sorts the candidate token pairs in place, highest score first.
     * <p>
     * The ordering is delegated to {@link Float#compare(float, float)} so that
     * the comparator is a consistent total order even when a score is NaN;
     * a comparator that reports NaN as "equal to everything" violates the
     * {@link java.util.Comparator} contract.
     *
     * @param list candidate pairs to sort; modified in place
     */
    private void sortCandidateList(ArrayList<Candidates> list) {
        java.util.Collections.sort(list, new java.util.Comparator<Candidates>() {
            public int compare(Candidates first, Candidates second) {
                // operands are swapped on purpose: descending order by score
                return Float.compare(second.getScore(), first.getScore());
            }
        });
    }

    /**
     * Builds the normalized weight vector for a token array.
     * <p>
     * When no IDF map is set, every token receives the same weight
     * {@code 1 / sqrt(n)}. Otherwise each token's weight is looked up in the
     * IDF map (tokens missing from the map weigh 0) and the vector is divided
     * by its Euclidean norm. If the norm is zero (for example, no token is
     * present in the map) the vector is left as all zeros instead of being
     * divided by zero.
     *
     * @param tokenArray tokens to weight
     * @return one weight per token, in the same order as tokenArray
     */
    private float[] getIDFArray(String[] tokenArray) {
        int tokenArrayLength = tokenArray.length;
        float[] IDFArray = new float[tokenArrayLength];
        if (idfMap == null) {
            float cosineWeight = 1.0f / ((float) Math.sqrt(tokenArrayLength));
            for (int i = 0; i < tokenArrayLength; i++) {
                IDFArray[i] = cosineWeight;
            }
            return IDFArray;
        }

        float sumOfSquares = 0f;
        for (int i = 0; i < tokenArrayLength; i++) {
            // explicit null check instead of relying on an unboxing NullPointerException
            Float storedWeight = idfMap.get(tokenArray[i]);
            float idfWeight = 0.0f;
            if (storedWeight != null) {
                idfWeight = storedWeight;
            }
            IDFArray[i] = idfWeight;
            sumOfSquares += idfWeight * idfWeight;
        }

        float norm = (float) Math.sqrt(sumOfSquares);
        // a zero norm would turn every entry into NaN (0/0); keep the zeros instead
        if (norm > 0.0f) {
            for (int i = 0; i < tokenArrayLength; i++) {
                IDFArray[i] = IDFArray[i] / norm;
            }
        }
        return IDFArray;
    }

    /**
     * Returns the long description of this metric, which is the same text as
     * the short description.
     *
     * @return the value of {@link #getShortDescriptionString()}
     */
    public String getLongDescriptionString() {
        final String description = getShortDescriptionString();
        return description;
    }

    /**
     * Returns the name of this metric together with the description of the
     * character-based metric it wraps. The name carries an {@code IDF} marker
     * when an IDF map is set.
     *
     * @return the short description string
     */
    public String getShortDescriptionString() {
        // NOTE(review): the result opens two brackets but closes only one;
        // kept as is because callers may rely on the exact text - confirm.
        final String prefix = (idfMap == null) ? "[TagLink_[" : "[TagLink_IDF_[";
        return prefix + characterBasedStringMetric.toString() + "]";
    }

    /**
     * Computes the IDF weight of every token in the given data set.
     * <p>
     * Each row is tokenised, and for every distinct token the number of rows
     * containing it is counted. The weight of a token is then
     * {@code log(N / rowsContainingToken + 1)}, where N is the number of rows.
     *
     * @param dataSetArray the rows of the data set
     * @return a map from token to its IDF weight
     */
    private HashMap<String,Float> getIDFMap(String[] dataSetArray) {
        float N = dataSetArray.length;
        HashMap<String,Float> weights = new HashMap<String, Float>();

        // first pass: count in how many rows each token occurs
        for (String actualRow : dataSetArray) {
            Vector<String> tokenVector = tokeniser.tokenize(actualRow);
            // a set, so that a token repeated within one row is counted once
            java.util.Set<String> distinctTokens = new java.util.HashSet<String>(tokenVector);
            for (String actualToken : distinctTokens) {
                weights.put(actualToken, getFrequency(actualToken, weights) + 1.0f);
            }
        }

        // second pass: replace each row count by the IDF weight, in place
        for (Map.Entry<String, Float> entry : weights.entrySet()) {
            float frequency = entry.getValue();
            float idf = (float) Math.log((N / frequency) + 1.0f);
            entry.setValue(idf);
        }
        return weights;
    }

    /**
     * Looks up the value stored for a word, treating an absent (or null)
     * entry as zero.
     *
     * @param word key to look up
     * @param map  map to read from
     * @return the stored value, or 0 when the map holds none for the word
     */
    private float getFrequency(String word, Map<String,Float> map) {
        final Float stored = map.get(word);
        return (stored != null) ? stored.floatValue() : 0.0f;
    }

    /**
     * Rounds a number to three decimal places, to the nearest value with ties
     * rounding up (the behaviour of {@link Math#round(float)}).
     * <p>
     * Negative inputs are rounded to the nearest value as well, rather than
     * being truncated toward zero. Inputs whose scaled value does not fit in
     * an {@code int} are returned unchanged: at that magnitude a float has no
     * precision left at the third decimal, and converting to {@code int}
     * would saturate. NaN yields 0, since {@code Math.round(NaN)} is 0.
     *
     * @param number value to round
     * @return the value rounded to three decimal places
     */
    private float round(float number) {
        float scaled = number * 1000.00f;
        if (Math.abs(scaled) >= Integer.MAX_VALUE) {
            // too large (or infinite) for the int conversion inside Math.round
            return number;
        }
        return Math.round(scaled) / 1000.00f;
    }
}

/**
 * A candidate pair of tokens, one from string T and one from string U,
 * identified by the tokens' positions and carrying the score assigned to
 * the pair.
 */
class Candidates implements Serializable {
    /** Position of the token within string T. */
    private int tPos;
    /** Position of the token within string U. */
    private int uPos;
    /** Score (or distance) assigned to the pair. */
    private float score;

    /**
     * Creates a candidate pair.
     *
     * @param tPos  position of the token in string T
     * @param uPos  position of the token in string U
     * @param score score or distance between the two tokens
     */
    public Candidates(int tPos, int uPos, float score) {
        this.score = score;
        this.uPos = uPos;
        this.tPos = tPos;
    }

    /**
     * Returns the position of the token in string T.
     *
     * @return the T position given at construction
     */
    public int getTPos() {
        return this.tPos;
    }

    /**
     * Returns the position of the token in string U.
     *
     * @return the U position given at construction
     */
    public int getUPos() {
        return this.uPos;
    }

    /**
     * Returns the score or distance between the two tokens.
     *
     * @return the score given at construction
     */
    public float getScore() {
        return this.score;
    }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -