siffingerprint.java
来自「dump3 morpheus 0.2.9 src」· Java 代码 · 共 331 行
JAVA
331 行
/**
* DuMP3 version morpheus_0.2.9 - a duplicate/similar file finder in Java<BR>
* Copyright 2005 Alexander Grässer<BR>
* All Rights Reserved, http://dump3.sourceforge.net/<BR>
* <BR>
* This file is part of DuMP3.<BR>
* <BR>
* DuMP3 is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option) any later version.<BR>
* <BR>
* DuMP3 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public License for more details.<BR>
* <BR>
* You should have received a copy of the GNU General Public License along with DuMP3; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
* Fifth Floor, Boston, MA 02110-1301 USA
*/
package net.za.grasser.duplicate.fingerprint;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import net.za.grasser.duplicate.file.FingerprintFile;
import net.za.grasser.duplicate.file.Status;
import net.za.grasser.duplicate.fingerprint.configure.ConfigFactory;
import net.za.grasser.duplicate.fingerprint.configure.SifFingerprintConfig;
import net.za.grasser.duplicate.util.HexArray;
import org.apache.log4j.Logger;
/**
* This class is used to calculate a fingerprint for a text file.<BR>
* It does this by finding a 'sif' fingerprints in the file.<BR>
* Limit the fingerprint length to x fingerprints per file.<BR>
* When the file is compared to another, a Jaccard coefficient is calculated on this set and a % match is returned.<BR>
* REF: <a href="http://webglimpse.net/pubs/TR93-33.pdf">Finding Similar Files in a Large File System</a>, Udi Manber, Oct 1993<BR>
*
* @author <a href="http://sourceforge.net/sendmessage.php?touser=733840">pyropunk at sourceforge dot net</a>
* @version $Revision: 1.11 $
* @modelguid {87AC7397-9A44-4C1C-B351-255DFA8ACEA3}
*/
public class SifFingerprint extends AbstractFingerprint {
/**
* <code>log</code> SubstringFingerprint -
*/
private static final Logger log = Logger.getLogger(SifFingerprint.class);
/**
* <code>parr</code> SifFingerprint - precalculated array of (p^i) mod M
*/
protected static int[] parr = null;
/**
* <code>arr</code> SifFingerprint - precalculated array of (i*p^49) mod M
*/
protected static int[] arr = null;
/**
* <code>ANCHOR_LENGTH</code> SifFingerprint - substring length<BR>
*/
protected static int ANCHOR_LENGTH = 50;
/**
* <code>M</code> SifFingerprint - Modulus currently set to 2^30<BR>
*/
protected static long M = 1073741824L;
/**
* <code>p</code> SifFingerprint - a prime - kept low for calc speed<BR>
*/
protected static int p = 11;
/**
* <code>k</code> SifFingerprint - number of bits to rip off (keep static)
*/
protected static final int k = 8;
/**
* <code>config</code> SifFingerprint -
*/
private static final SifFingerprintConfig config = (SifFingerprintConfig)ConfigFactory.getConfig(SifFingerprint.class);
/**
* certain things can be precomputed since (a mod x)(b mod x) = (ab) mod x<BR>
* (p^i) mod M - for all i between 0 and 49<BR>
* (i*p^49) mod M - for all i between 0 and 255<BR>
*/
static {
parr = new int[ANCHOR_LENGTH];
for (int i = 0; i < parr.length; i++) {
parr[i] = (new BigInteger("" + p)).pow(i).mod(new BigInteger("" + M)).intValue();
}
arr = new int[256];
for (int i = 0; i < arr.length; i++) {
arr[i] = (new BigInteger("" + i)).multiply(new BigInteger("" + parr[49])).mod(new BigInteger("" + M)).intValue();
}
}
/**
* calculates a fingerprint hash set from a byte[] <BR>
* assumes k = 8 and a LittleEndian architecture!
*
* @param pFinger byte[]
* @return HashSet
*/
public static HashSet<Integer> makeFingermapFromPrint(final byte[] pFinger) {
final HashSet<Integer> fingerset = new HashSet<Integer>();
for (int i = 0; i < pFinger.length; i += 3) {
int temp = pFinger[i + 2] & 0xff;
temp += pFinger[i + 1] << 8 & 0xff00;
temp += pFinger[i] << 16 & 0xff0000;
fingerset.add(new Integer(temp));
}
return fingerset;
}
/**
* calculates a byte[] from a fingerprint hash set<Br>
* assumes k = 8 and a LittleEndian architecture!<BR>
* adjusts the hashset to have values with 8 high bits set to 0
*
* @param pHash HashSet
* @return byte[]
*/
public static byte[] makeFingerprintFromHash(final HashSet<Integer> pHash) {
String fp = "";
final HashSet<Integer> s = new HashSet<Integer>();
Integer bi = null;
final List<Integer> list = new ArrayList<Integer>(pHash);
Collections.sort(list);
// limit fingerprint length to FINGER_LENGTH smallest hashes
int i = 0;
final Iterator<Integer> it = list.iterator();
while (it.hasNext() && i < config.getFingerLength()) {
bi = it.next();
String str = null;
final int temp = bi.intValue() & 0x7fffffff;
if (temp < 0x00ffffff) {
s.add(new Integer((temp & 0x00ffffff)));
str = "000000" + Integer.toHexString((temp & 0x00ffffff));
} else {
s.add(new Integer((bi.intValue() >> k & 0x00ffffff)));
str = "000000" + Integer.toHexString((bi.intValue() >> k & 0x00ffffff));
}
fp += str.substring(str.length() - 6, str.length());
i++;
}
pHash.clear();
pHash.addAll(s);
// make fingerprint
return HexArray.makeBytes(fp);
}
/**
* <code>fingerfound</code> SifFingerprint - found a fingerprint with last k bits = 0
*/
protected boolean fingerfound = false;
/**
* <code>fingermap</code> SubstringFingerprint - hashset of modulus fingerprints
*/
HashSet<Integer> fingermap = null;
/**
* <code>last</code> SubstringFingerprint - last number of bytes so we can continue calculating<BR>
*/
byte[] last = null;
/**
* @param fi FingerprintFile
* @modelguid {53F63DBA-703A-4B81-882B-F1B4318D5CC9}
*/
public SifFingerprint(final FingerprintFile fi) {
super(fi);
}
/**
* add a fingerprint to the set<BR>
* once a fingerprint with last k bits = 0 is found, clear the set and only add fingerprints with last k bits = 0<BR>
* in theory this limits the set to +-255 entries and saves space.
*
* @param pInt BigInteger
*/
private void add(final BigInteger pInt) {
if (fingerfound) {
if (pInt.getLowestSetBit() >= k) {
fingermap.add(new Integer(pInt.intValue()));
}
} else {
if (pInt.getLowestSetBit() >= k) {
fingermap.clear();
}
fingermap.add(new Integer(pInt.intValue()));
}
}
/**
* do 'Jaccard coefficient' calculation on set of values<BR>
* TODO: maybe adjust by the number of occurrences
*
* @param fi
* @return int - percentage
*/
private float compare(final SifFingerprint fi) {
final Set<Integer> setU = new HashSet<Integer>();
final Set<Integer> setI = new HashSet<Integer>();
setU.addAll(fingermap);
setU.addAll(fi.fingermap);
setI.addAll(fingermap);
setI.retainAll(fi.fingermap);
return (float)setI.size() / (float)setU.size() * 100.0f;
}
/**
* @return String
* @modelguid {E440866E-6C37-49CC-8B8A-E6645E19C3B1}
*/
@Override
public String getClassName() {
return super.getClassName() + '-' + config.getFingerLength();
}
/**
* @see net.za.grasser.duplicate.fingerprint.AbstractFingerprint#getSimilarityThreshhold()
*/
@Override
public float getSimilarityThreshhold() {
return config.getSimilarityThreshhold();
}
/**
* @param fi AbstractFingerprint
* @return boolean
* @modelguid {3E6D5504-3A75-4444-BE5B-C23BB0DF4DE8}
*/
@Override
public float matches(final AbstractFingerprint fi) {
final FingerprintFile ff1 = getFile();
final FingerprintFile ff2 = fi.getFile();
if (ff1.getStatus() != Status.FILE_OK && ff1.getStatus() != Status.FILE_SIGNATURE_MISMATCH || ff2.getStatus() != Status.FILE_OK
&& ff2.getStatus() != Status.FILE_SIGNATURE_MISMATCH) {
return 0.0f;
}
if (ff1.getLength() == 0L && ff2.getLength() == 0L) {
return 100.0f;
}
if (fi instanceof SifFingerprint) {
final SifFingerprint tf = (SifFingerprint)fi;
// try to read the files
getFingerprint();
tf.getFingerprint();
if ((ff1.getStatus() == Status.FILE_OK || ff1.getStatus() == Status.FILE_SIGNATURE_MISMATCH)
&& (ff2.getStatus() == Status.FILE_OK || ff2.getStatus() == Status.FILE_SIGNATURE_MISMATCH)) {
if (fingermap == null) {
fingermap = makeFingermapFromPrint(getFingerprint());
}
if (tf.fingermap == null) {
tf.fingermap = makeFingermapFromPrint(tf.getFingerprint());
}
final float res = compare(tf);
if (log.isTraceEnabled()) {
log.trace(ff1.getPath() + "=" + res + "=" + ff2.getPath());
}
return res;
}
}
return 0.0f;
}
/**
* teardown phase after reading the file<BR>
* fingerprint must be computed here
*/
@Override
protected void postRead() {
fingerprint = makeFingerprintFromHash(fingermap);
// don't waste space
last = null;
}
/**
* setup phase before reading the file
*/
@Override
protected void preRead() {
fingermap = new HashSet<Integer>();
}
/**
* calculate the fingerprints and add them to the fingerprint map
*
* @param pBuffer
* @param pLength
*/
@Override
protected void update(final byte[] pBuffer, final int pLength) {
if (pLength >= ANCHOR_LENGTH) {
BigInteger temp = new BigInteger("0");
if (last == null) {
// F1 = (t0*p^49 + t1*p^48 + ... + t49) mod M
for (int i = 0; i < ANCHOR_LENGTH; i++) {
temp = temp.add((new BigInteger("" + pBuffer[i])).multiply(new BigInteger("" + parr[ANCHOR_LENGTH - i - 1])));
}
temp = temp.mod(new BigInteger("" + M));
add(temp);
} else {
// continue calculating
for (int i = 0; i < ANCHOR_LENGTH; i++) {
final int ch = last[i] & 0xFF;
temp = temp.multiply(new BigInteger("" + p)).add(new BigInteger("" + pBuffer[i])).subtract(new BigInteger("" + arr[ch]));
temp = temp.mod(new BigInteger("" + M));
add(temp);
}
}
// F2 = (p*F1 + t50 - t0*p^49) mod M
for (int i = ANCHOR_LENGTH; i < pLength - ANCHOR_LENGTH; i++) {
final int ch = pBuffer[(i - ANCHOR_LENGTH)] & 0xFF;
temp = temp.multiply(new BigInteger("" + p)).add(new BigInteger("" + pBuffer[i])).subtract(new BigInteger("" + arr[ch]));
temp = temp.mod(new BigInteger("" + M));
add(temp);
}
int j = 0;
last = new byte[ANCHOR_LENGTH];
for (int i = pLength - ANCHOR_LENGTH; i < pLength; i++) {
last[j++] = pBuffer[i];
}
} else {
// file too short for normal fingerprinting
BigInteger temp = new BigInteger("0");
// F1 = (t0*p^49 + t1*p^48 + ... + tn*p^(49-len)) mod M
for (int i = 0; i < pLength; i++) {
temp = temp.add((new BigInteger("" + pBuffer[i])).multiply(new BigInteger("" + parr[ANCHOR_LENGTH - i - 1])));
}
temp = temp.mod(new BigInteger("" + M));
add(temp);
}
}
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?