substringfingerprint.java

来自「dump3 morpheus 0.2.9 src」· Java 代码 · 共 351 行

JAVA
351
字号
/**
 * DuMP3 version morpheus_0.2.9 - a duplicate/similar file finder in Java<BR>
 * Copyright 2005 Alexander Gr&auml;sser<BR>
 * All Rights Reserved, http://dump3.sourceforge.net/<BR>
 * <BR>
 * This file is part of DuMP3.<BR>
 * <BR>
 * DuMP3 is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later version.<BR>
 * <BR>
 * DuMP3 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.<BR>
 * <BR>
 * You should have received a copy of the GNU General Public License along with DuMP3; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
 * Fifth Floor, Boston, MA 02110-1301 USA
 */
package net.za.grasser.duplicate.fingerprint;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import net.za.grasser.duplicate.file.FingerprintFile;
import net.za.grasser.duplicate.file.Status;
import net.za.grasser.duplicate.fingerprint.configure.ConfigFactory;
import net.za.grasser.duplicate.fingerprint.configure.SubstringFingerprintConfig;
import net.za.grasser.duplicate.util.Constants;
import org.apache.log4j.Logger;

/**
 * This class is used to calculate a fingerprint for a text file.<BR>
 * It does this by finding a set of the n rarest (m character) substrings in the file.<BR>
 * When the file is compared to another, a Jaccard coefficient is calculated on this set and a % match is returned.<BR>
 * REF: <a
 * href="http://www.cs.tau.ac.il/~matias/courses/Seminar_Spring03/Estimating%20Rarity%20and%20Similarity%20over%20Data%20stream%20Windows.ppt">Estimating rarity
 * and similarity over Windowed data streams</a>, Yossi Matias, Spring 2003<BR>
 * REF: <a href="http://www-2.cs.cmu.edu/afs/cs/user/nch/www/koala/main.html">Scalable Document Fingerprinting (Extended Abstract)</a>, Nevin Heintze, Oct.
 * 1996<BR>
 * 
 * @author <a href="http://sourceforge.net/sendmessage.php?touser=733840">pyropunk at sourceforge dot net</a>
 * @version $Revision: 1.11 $
 * @modelguid {87AC7397-9A44-4C1C-B351-255DFA8ACEA3}
 */
public class SubstringFingerprint extends AbstractFingerprint {
  /**
   * <code>log</code> SubstringFingerprint -
   */
  private static final Logger log = Logger.getLogger(SubstringFingerprint.class);

  /**
   * This class is used internally for the entries into the fingerprint map.
   * 
   * @author <a href="http://sourceforge.net/sendmessage.php?touser=733840">pyropunk at sourceforge dot net</a>
   * @version $Revision: 1.11 $
   */
  static class Entry implements Comparable<Object> {
    /**
     * <code>key</code> Entry -
     */
    public String key = null;
    /**
     * <code>value</code> Entry -
     */
    public int value = 0;

    /**
     * Constructor
     * 
     * @param pBuf
     * @param pOff
     */
    Entry(final byte[] pBuf, final int pOff) {
      key = new String(pBuf, pOff, ANCHOR_LENGTH);
      value = 0;
      for (int i = 0; i < Constants.SIZE_OF_INT; i++) {
        value += (pBuf[(pOff + ANCHOR_LENGTH + i)] & 0xFF) << (i << 3);
      }
    }

    /**
     * Constructor
     * 
     * @param pKey
     * @param pVal
     */
    Entry(final String pKey, final int pVal) {
      key = pKey;
      value = pVal;
    }

    /**
     * @see java.lang.Comparable#compareTo(java.lang.Object)
     */
    public int compareTo(final Object arg0) {
      if (arg0 instanceof Entry) {
        return value - ((Entry)arg0).value;
      }
      throw new ClassCastException();
    }

    /**
     * @see java.lang.Object#equals(java.lang.Object)
     */
    @Override
    public boolean equals(final Object arg0) {
      return compareTo(arg0) == 0;
    }

    /**
     * @return byte[]
     */
    public byte[] getBytes() {
      final byte[] ret = new byte[(ANCHOR_LENGTH + Constants.SIZE_OF_INT)];
      final byte[] a = key.getBytes();
      for (int i = 0; i < ANCHOR_LENGTH; i++) {
        ret[i] = a[i];
      }
      for (int i = 0; i < Constants.SIZE_OF_INT; i++) {
        ret[(ANCHOR_LENGTH + i)] = (byte)(value >> (i << 3) & 0xFF);
      }
      return ret;
    }

    /**
     * increment value
     */
    public void inc() {
      value++;
    }

    /**
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
      return key + '#' + value;
    }
  }

  /**
   * <code>ANCHOR_LENGTH</code> SubstringFingerprint - substring length
   */
  protected static int ANCHOR_LENGTH = 6;
  /**
   * <code>HASH_LENGTH</code> SubstringFingerprint - fingerprint subsection length
   */
  protected static int HASH_LENGTH = ANCHOR_LENGTH + Constants.SIZE_OF_INT;
  /**
   * <code>config</code> SubstringFingerprint -
   */
  private static final SubstringFingerprintConfig config = (SubstringFingerprintConfig)ConfigFactory.getConfig(SubstringFingerprint.class);

  /**
   * make fingerprint hash from map<BR>
   * using least used substring hashing
   * 
   * @param list List
   * @return byte[]
   */
  private static byte[] makeHash(final List<Entry> list) {
    final int l = list.size() > config.getFingerLength() ? config.getFingerLength() : list.size();
    final byte[] fprint = new byte[(l * HASH_LENGTH)];
    for (int i = 0; i < l; i++) {
      final byte[] a = list.get(i).getBytes();
      for (int j = 0; j < HASH_LENGTH; j++) {
        fprint[i * HASH_LENGTH + j] = a[j];
      }
    }
    return fprint;
  }

  /**
   * reconstruct map from fingerprint hash
   * 
   * @param buffer
   * @return HashMap
   */
  protected static HashMap<String, Entry> makeMap(final byte[] buffer) {
    final HashMap<String, Entry> ret = new HashMap<String, Entry>();
    for (int i = 0; i < buffer.length / HASH_LENGTH; i++) {
      final Entry in = new Entry(buffer, (i * HASH_LENGTH));
      ret.put(in.key, in);
    }
    return ret;
  }

  /**
   * <code>fingermap</code> SubstringFingerprint - hastable of substrings to number of occurences
   */
  HashMap<String, Entry> fingermap = null;
  /**
   * <code>pos</code> SubstringFingerprint - position in lastSeen array.
   */
  int pos = 0;
  /**
   * <code>lastSeen</code> SubstringFingerprint -
   */
  String[] lastSeen = null;

  /**
   * @param fi FingerprintFile
   * @modelguid {53F63DBA-703A-4B81-882B-F1B4318D5CC9}
   */
  public SubstringFingerprint(final FingerprintFile fi) {
    super(fi);
  }

  /**
   * do 'Jaccard coefficient' calculation on set of values<BR>
   * TODO: maybe adjust by the number of occurrences
   * 
   * @param fi
   * @return int - percentage
   */
  private float compare(final SubstringFingerprint fi) {
    final Set<String> setU = new HashSet<String>();
    final Set<String> setI = new HashSet<String>();
    setU.addAll(fingermap.keySet());
    setU.addAll(fi.fingermap.keySet());
    setI.addAll(fingermap.keySet());
    setI.retainAll(fi.fingermap.keySet());
    return (float)setI.size() / (float)setU.size() * 100.0f;
  }

  /**
   * @return String
   * @modelguid {E440866E-6C37-49CC-8B8A-E6645E19C3B1}
   */
  @Override
  public String getClassName() {
    return super.getClassName() + '-' + config.getFingerLength();
  }

  /**
   * @see net.za.grasser.duplicate.fingerprint.AbstractFingerprint#getSimilarityThreshhold()
   */
  @Override
  public float getSimilarityThreshhold() {
    return config.getSimilarityThreshhold();
  }

  /**
   * @param fi AbstractFingerprint
   * @return boolean
   * @modelguid {3E6D5504-3A75-4444-BE5B-C23BB0DF4DE8}
   */
  @Override
  public float matches(final AbstractFingerprint fi) {
    final FingerprintFile ff1 = getFile();
    final FingerprintFile ff2 = fi.getFile();
    if (ff1.getStatus() != Status.FILE_OK && ff1.getStatus() != Status.FILE_SIGNATURE_MISMATCH || ff2.getStatus() != Status.FILE_OK
        && ff2.getStatus() != Status.FILE_SIGNATURE_MISMATCH) {
      return 0.0f;
    }
    if (ff1.getLength() == 0L && ff2.getLength() == 0L) {
      return 100.0f;
    }
    if (fi instanceof SubstringFingerprint) {
      final SubstringFingerprint tf = (SubstringFingerprint)fi;
      // try to read the files
      getFingerprint();
      tf.getFingerprint();
      if ((ff1.getStatus() == Status.FILE_OK || ff1.getStatus() == Status.FILE_SIGNATURE_MISMATCH)
          && (ff2.getStatus() == Status.FILE_OK || ff2.getStatus() == Status.FILE_SIGNATURE_MISMATCH)) {
        if (fingermap == null) {
          fingermap = makeMap(getFingerprint());
        }
        if (tf.fingermap == null) {
          tf.fingermap = makeMap(tf.getFingerprint());
        }
        final float res = compare(tf);
        if (log.isTraceEnabled()) {
          log.trace(ff1.getPath() + "=" + res + "=" + ff2.getPath());
        }
        return res;
      }
    }
    return 0.0f;
  }

  /**
   * teardown phase after reading the file<BR>
   * fingerprint must be computed here
   */
  @Override
  protected void postRead() {
    // don't waste space
    lastSeen = null;
    // determine least occuring strings
    final List<Entry> list = new ArrayList<Entry>();
    final Iterator<Entry> it = fingermap.values().iterator();
    while (it.hasNext()) {
      list.add(it.next());
    }
    Collections.sort(list);
    if (list.size() > config.getFingerLength()) {
      for (int i = config.getFingerLength(); i < list.size(); i++) {
        fingermap.remove(list.get(i).key);
      }
    }
    // make fingerprint
    fingerprint = makeHash(list);
  }

  /**
   * setup phase before reading the file
   */
  @Override
  protected void preRead() {
    fingermap = new HashMap<String, Entry>();
    lastSeen = new String[ANCHOR_LENGTH];
  }

  /**
   * add substrings to the fingerprint map
   * 
   * @param pBuffer
   * @param pLength
   */
  @Override
  protected void update(final byte[] pBuffer, final int pLength) {
    String s = null;
    for (int i = 0; i < pLength - ANCHOR_LENGTH; i++) {
      s = new String(pBuffer, i, ANCHOR_LENGTH);
      boolean notfound = true;
      for (int j = 0; j < ANCHOR_LENGTH; j++) {
        final int p = (j + pos) % ANCHOR_LENGTH;
        if (lastSeen[p] != null && s.equals(lastSeen[p])) {
          notfound = false;
          break;
        }
      }
      pos = (pos + 1) % ANCHOR_LENGTH;
      if (notfound) {
        lastSeen[pos] = s;
        final Object val = fingermap.get(s);
        if (val == null) {
          fingermap.put(s, new Entry(s, 1));
        } else {
          ((Entry)val).inc();
        }
      } else {
        lastSeen[pos] = null;
      }
    }
  }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?