siffingerprint.java

来自「dump3 morpheus 0.2.9 src」· Java 代码 · 共 331 行

JAVA
331
字号
/**
 * DuMP3 version morpheus_0.2.9 - a duplicate/similar file finder in Java<BR>
 * Copyright 2005 Alexander Gr&auml;sser<BR>
 * All Rights Reserved, http://dump3.sourceforge.net/<BR>
 * <BR>
 * This file is part of DuMP3.<BR>
 * <BR>
 * DuMP3 is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later version.<BR>
 * <BR>
 * DuMP3 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.<BR>
 * <BR>
 * You should have received a copy of the GNU General Public License along with DuMP3; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
 * Fifth Floor, Boston, MA 02110-1301 USA
 */
package net.za.grasser.duplicate.fingerprint;

import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import net.za.grasser.duplicate.file.FingerprintFile;
import net.za.grasser.duplicate.file.Status;
import net.za.grasser.duplicate.fingerprint.configure.ConfigFactory;
import net.za.grasser.duplicate.fingerprint.configure.SifFingerprintConfig;
import net.za.grasser.duplicate.util.HexArray;
import org.apache.log4j.Logger;

/**
 * This class is used to calculate a fingerprint for a text file.<BR>
 * It does this by finding a 'sif' fingerprints in the file.<BR>
 * Limit the fingerprint length to x fingerprints per file.<BR>
 * When the file is compared to another, a Jaccard coefficient is calculated on this set and a % match is returned.<BR>
 * REF: <a href="http://webglimpse.net/pubs/TR93-33.pdf">Finding Similar Files in a Large File System</a>, Udi Manber, Oct 1993<BR>
 * 
 * @author <a href="http://sourceforge.net/sendmessage.php?touser=733840">pyropunk at sourceforge dot net</a>
 * @version $Revision: 1.11 $
 * @modelguid {87AC7397-9A44-4C1C-B351-255DFA8ACEA3}
 */
public class SifFingerprint extends AbstractFingerprint {
  /**
   * <code>log</code> SubstringFingerprint -
   */
  private static final Logger log = Logger.getLogger(SifFingerprint.class);
  /**
   * <code>parr</code> SifFingerprint - precalculated array of (p^i) mod M
   */
  protected static int[] parr = null;
  /**
   * <code>arr</code> SifFingerprint - precalculated array of (i*p^49) mod M
   */
  protected static int[] arr = null;
  /**
   * <code>ANCHOR_LENGTH</code> SifFingerprint - substring length<BR>
   */
  protected static int ANCHOR_LENGTH = 50;
  /**
   * <code>M</code> SifFingerprint - Modulus currently set to 2^30<BR>
   */
  protected static long M = 1073741824L;
  /**
   * <code>p</code> SifFingerprint - a prime - kept low for calc speed<BR>
   */
  protected static int p = 11;
  /**
   * <code>k</code> SifFingerprint - number of bits to rip off (keep static)
   */
  protected static final int k = 8;
  /**
   * <code>config</code> SifFingerprint -
   */
  private static final SifFingerprintConfig config = (SifFingerprintConfig)ConfigFactory.getConfig(SifFingerprint.class);
  /**
   * certain things can be precomputed since (a mod x)(b mod x) = (ab) mod x<BR>
   * (p^i) mod M - for all i between 0 and 49<BR>
   * (i*p^49) mod M - for all i between 0 and 255<BR>
   */
  static {
    parr = new int[ANCHOR_LENGTH];
    for (int i = 0; i < parr.length; i++) {
      parr[i] = (new BigInteger("" + p)).pow(i).mod(new BigInteger("" + M)).intValue();
    }
    arr = new int[256];
    for (int i = 0; i < arr.length; i++) {
      arr[i] = (new BigInteger("" + i)).multiply(new BigInteger("" + parr[49])).mod(new BigInteger("" + M)).intValue();
    }
  }

  /**
   * calculates a fingerprint hash set from a byte[] <BR>
   * assumes k = 8 and a LittleEndian architecture!
   * 
   * @param pFinger byte[]
   * @return HashSet
   */
  public static HashSet<Integer> makeFingermapFromPrint(final byte[] pFinger) {
    final HashSet<Integer> fingerset = new HashSet<Integer>();
    for (int i = 0; i < pFinger.length; i += 3) {
      int temp = pFinger[i + 2] & 0xff;
      temp += pFinger[i + 1] << 8 & 0xff00;
      temp += pFinger[i] << 16 & 0xff0000;
      fingerset.add(new Integer(temp));
    }
    return fingerset;
  }

  /**
   * calculates a byte[] from a fingerprint hash set<Br>
   * assumes k = 8 and a LittleEndian architecture!<BR>
   * adjusts the hashset to have values with 8 high bits set to 0
   * 
   * @param pHash HashSet
   * @return byte[]
   */
  public static byte[] makeFingerprintFromHash(final HashSet<Integer> pHash) {
    String fp = "";
    final HashSet<Integer> s = new HashSet<Integer>();
    Integer bi = null;
    final List<Integer> list = new ArrayList<Integer>(pHash);
    Collections.sort(list);
    // limit fingerprint length to FINGER_LENGTH smallest hashes
    int i = 0;
    final Iterator<Integer> it = list.iterator();
    while (it.hasNext() && i < config.getFingerLength()) {
      bi = it.next();
      String str = null;
      final int temp = bi.intValue() & 0x7fffffff;
      if (temp < 0x00ffffff) {
        s.add(new Integer((temp & 0x00ffffff)));
        str = "000000" + Integer.toHexString((temp & 0x00ffffff));
      } else {
        s.add(new Integer((bi.intValue() >> k & 0x00ffffff)));
        str = "000000" + Integer.toHexString((bi.intValue() >> k & 0x00ffffff));
      }
      fp += str.substring(str.length() - 6, str.length());
      i++;
    }
    pHash.clear();
    pHash.addAll(s);
    // make fingerprint
    return HexArray.makeBytes(fp);
  }

  /**
   * <code>fingerfound</code> SifFingerprint - found a fingerprint with last k bits = 0
   */
  protected boolean fingerfound = false;
  /**
   * <code>fingermap</code> SubstringFingerprint - hashset of modulus fingerprints
   */
  HashSet<Integer> fingermap = null;
  /**
   * <code>last</code> SubstringFingerprint - last number of bytes so we can continue calculating<BR>
   */
  byte[] last = null;

  /**
   * @param fi FingerprintFile
   * @modelguid {53F63DBA-703A-4B81-882B-F1B4318D5CC9}
   */
  public SifFingerprint(final FingerprintFile fi) {
    super(fi);
  }

  /**
   * add a fingerprint to the set<BR>
   * once a fingerprint with last k bits = 0 is found, clear the set and only add fingerprints with last k bits = 0<BR>
   * in theory this limits the set to +-255 entries and saves space.
   * 
   * @param pInt BigInteger
   */
  private void add(final BigInteger pInt) {
    if (fingerfound) {
      if (pInt.getLowestSetBit() >= k) {
        fingermap.add(new Integer(pInt.intValue()));
      }
    } else {
      if (pInt.getLowestSetBit() >= k) {
        fingermap.clear();
      }
      fingermap.add(new Integer(pInt.intValue()));
    }
  }

  /**
   * do 'Jaccard coefficient' calculation on set of values<BR>
   * TODO: maybe adjust by the number of occurrences
   * 
   * @param fi
   * @return int - percentage
   */
  private float compare(final SifFingerprint fi) {
    final Set<Integer> setU = new HashSet<Integer>();
    final Set<Integer> setI = new HashSet<Integer>();
    setU.addAll(fingermap);
    setU.addAll(fi.fingermap);
    setI.addAll(fingermap);
    setI.retainAll(fi.fingermap);
    return (float)setI.size() / (float)setU.size() * 100.0f;
  }

  /**
   * @return String
   * @modelguid {E440866E-6C37-49CC-8B8A-E6645E19C3B1}
   */
  @Override
  public String getClassName() {
    return super.getClassName() + '-' + config.getFingerLength();
  }

  /**
   * @see net.za.grasser.duplicate.fingerprint.AbstractFingerprint#getSimilarityThreshhold()
   */
  @Override
  public float getSimilarityThreshhold() {
    return config.getSimilarityThreshhold();
  }

  /**
   * @param fi AbstractFingerprint
   * @return boolean
   * @modelguid {3E6D5504-3A75-4444-BE5B-C23BB0DF4DE8}
   */
  @Override
  public float matches(final AbstractFingerprint fi) {
    final FingerprintFile ff1 = getFile();
    final FingerprintFile ff2 = fi.getFile();
    if (ff1.getStatus() != Status.FILE_OK && ff1.getStatus() != Status.FILE_SIGNATURE_MISMATCH || ff2.getStatus() != Status.FILE_OK
        && ff2.getStatus() != Status.FILE_SIGNATURE_MISMATCH) {
      return 0.0f;
    }
    if (ff1.getLength() == 0L && ff2.getLength() == 0L) {
      return 100.0f;
    }
    if (fi instanceof SifFingerprint) {
      final SifFingerprint tf = (SifFingerprint)fi;
      // try to read the files
      getFingerprint();
      tf.getFingerprint();
      if ((ff1.getStatus() == Status.FILE_OK || ff1.getStatus() == Status.FILE_SIGNATURE_MISMATCH)
          && (ff2.getStatus() == Status.FILE_OK || ff2.getStatus() == Status.FILE_SIGNATURE_MISMATCH)) {
        if (fingermap == null) {
          fingermap = makeFingermapFromPrint(getFingerprint());
        }
        if (tf.fingermap == null) {
          tf.fingermap = makeFingermapFromPrint(tf.getFingerprint());
        }
        final float res = compare(tf);
        if (log.isTraceEnabled()) {
          log.trace(ff1.getPath() + "=" + res + "=" + ff2.getPath());
        }
        return res;
      }
    }
    return 0.0f;
  }

  /**
   * teardown phase after reading the file<BR>
   * fingerprint must be computed here
   */
  @Override
  protected void postRead() {
    fingerprint = makeFingerprintFromHash(fingermap);
    // don't waste space
    last = null;
  }

  /**
   * setup phase before reading the file
   */
  @Override
  protected void preRead() {
    fingermap = new HashSet<Integer>();
  }

  /**
   * calculate the fingerprints and add them to the fingerprint map
   * 
   * @param pBuffer
   * @param pLength
   */
  @Override
  protected void update(final byte[] pBuffer, final int pLength) {
    if (pLength >= ANCHOR_LENGTH) {
      BigInteger temp = new BigInteger("0");
      if (last == null) {
        // F1 = (t0*p^49 + t1*p^48 + ... + t49) mod M
        for (int i = 0; i < ANCHOR_LENGTH; i++) {
          temp = temp.add((new BigInteger("" + pBuffer[i])).multiply(new BigInteger("" + parr[ANCHOR_LENGTH - i - 1])));
        }
        temp = temp.mod(new BigInteger("" + M));
        add(temp);
      } else {
        // continue calculating
        for (int i = 0; i < ANCHOR_LENGTH; i++) {
          final int ch = last[i] & 0xFF;
          temp = temp.multiply(new BigInteger("" + p)).add(new BigInteger("" + pBuffer[i])).subtract(new BigInteger("" + arr[ch]));
          temp = temp.mod(new BigInteger("" + M));
          add(temp);
        }
      }
      // F2 = (p*F1 + t50 - t0*p^49) mod M
      for (int i = ANCHOR_LENGTH; i < pLength - ANCHOR_LENGTH; i++) {
        final int ch = pBuffer[(i - ANCHOR_LENGTH)] & 0xFF;
        temp = temp.multiply(new BigInteger("" + p)).add(new BigInteger("" + pBuffer[i])).subtract(new BigInteger("" + arr[ch]));
        temp = temp.mod(new BigInteger("" + M));
        add(temp);
      }
      int j = 0;
      last = new byte[ANCHOR_LENGTH];
      for (int i = pLength - ANCHOR_LENGTH; i < pLength; i++) {
        last[j++] = pBuffer[i];
      }
    } else {
      // file too short for normal fingerprinting
      BigInteger temp = new BigInteger("0");
      // F1 = (t0*p^49 + t1*p^48 + ... + tn*p^(49-len)) mod M
      for (int i = 0; i < pLength; i++) {
        temp = temp.add((new BigInteger("" + pBuffer[i])).multiply(new BigInteger("" + parr[ANCHOR_LENGTH - i - 1])));
      }
      temp = temp.mod(new BigInteger("" + M));
      add(temp);
    }
  }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?