substringfingerprint.java
来自「dump3 morpheus 0.2.9 src」· Java 代码 · 共 351 行
JAVA
351 行
/**
* DuMP3 version morpheus_0.2.9 - a duplicate/similar file finder in Java<BR>
* Copyright 2005 Alexander Grässer<BR>
* All Rights Reserved, http://dump3.sourceforge.net/<BR>
* <BR>
* This file is part of DuMP3.<BR>
* <BR>
* DuMP3 is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option) any later version.<BR>
* <BR>
* DuMP3 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public License for more details.<BR>
* <BR>
* You should have received a copy of the GNU General Public License along with DuMP3; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
* Fifth Floor, Boston, MA 02110-1301 USA
*/
package net.za.grasser.duplicate.fingerprint;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import net.za.grasser.duplicate.file.FingerprintFile;
import net.za.grasser.duplicate.file.Status;
import net.za.grasser.duplicate.fingerprint.configure.ConfigFactory;
import net.za.grasser.duplicate.fingerprint.configure.SubstringFingerprintConfig;
import net.za.grasser.duplicate.util.Constants;
import org.apache.log4j.Logger;
/**
* This class is used to calculate a fingerprint for a text file.<BR>
* It does this by finding a set of the n rarest (m character) substrings in the file.<BR>
* When the file is compared to another, a Jaccard coefficient is calculated on this set and a % match is returned.<BR>
* REF: <a
* href="http://www.cs.tau.ac.il/~matias/courses/Seminar_Spring03/Estimating%20Rarity%20and%20Similarity%20over%20Data%20stream%20Windows.ppt">Estimating rarity
* and similarity over Windowed data streams</a>, Yossi Matias, Spring 2003<BR>
* REF: <a href="http://www-2.cs.cmu.edu/afs/cs/user/nch/www/koala/main.html">Scalable Document Fingerprinting (Extended Abstract)</a>, Nevin Heintze, Oct.
* 1996<BR>
*
* @author <a href="http://sourceforge.net/sendmessage.php?touser=733840">pyropunk at sourceforge dot net</a>
* @version $Revision: 1.11 $
* @modelguid {87AC7397-9A44-4C1C-B351-255DFA8ACEA3}
*/
public class SubstringFingerprint extends AbstractFingerprint {
/**
* <code>log</code> SubstringFingerprint -
*/
private static final Logger log = Logger.getLogger(SubstringFingerprint.class);
/**
* This class is used internally for the entries into the fingerprint map.
*
* @author <a href="http://sourceforge.net/sendmessage.php?touser=733840">pyropunk at sourceforge dot net</a>
* @version $Revision: 1.11 $
*/
static class Entry implements Comparable<Object> {
/**
* <code>key</code> Entry -
*/
public String key = null;
/**
* <code>value</code> Entry -
*/
public int value = 0;
/**
* Constructor
*
* @param pBuf
* @param pOff
*/
Entry(final byte[] pBuf, final int pOff) {
key = new String(pBuf, pOff, ANCHOR_LENGTH);
value = 0;
for (int i = 0; i < Constants.SIZE_OF_INT; i++) {
value += (pBuf[(pOff + ANCHOR_LENGTH + i)] & 0xFF) << (i << 3);
}
}
/**
* Constructor
*
* @param pKey
* @param pVal
*/
Entry(final String pKey, final int pVal) {
key = pKey;
value = pVal;
}
/**
* @see java.lang.Comparable#compareTo(java.lang.Object)
*/
public int compareTo(final Object arg0) {
if (arg0 instanceof Entry) {
return value - ((Entry)arg0).value;
}
throw new ClassCastException();
}
/**
* @see java.lang.Object#equals(java.lang.Object)
*/
@Override
public boolean equals(final Object arg0) {
return compareTo(arg0) == 0;
}
/**
* @return byte[]
*/
public byte[] getBytes() {
final byte[] ret = new byte[(ANCHOR_LENGTH + Constants.SIZE_OF_INT)];
final byte[] a = key.getBytes();
for (int i = 0; i < ANCHOR_LENGTH; i++) {
ret[i] = a[i];
}
for (int i = 0; i < Constants.SIZE_OF_INT; i++) {
ret[(ANCHOR_LENGTH + i)] = (byte)(value >> (i << 3) & 0xFF);
}
return ret;
}
/**
* increment value
*/
public void inc() {
value++;
}
/**
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
return key + '#' + value;
}
}
/**
* <code>ANCHOR_LENGTH</code> SubstringFingerprint - substring length
*/
protected static int ANCHOR_LENGTH = 6;
/**
* <code>HASH_LENGTH</code> SubstringFingerprint - fingerprint subsection length
*/
protected static int HASH_LENGTH = ANCHOR_LENGTH + Constants.SIZE_OF_INT;
/**
* <code>config</code> SubstringFingerprint -
*/
private static final SubstringFingerprintConfig config = (SubstringFingerprintConfig)ConfigFactory.getConfig(SubstringFingerprint.class);
/**
* make fingerprint hash from map<BR>
* using least used substring hashing
*
* @param list List
* @return byte[]
*/
private static byte[] makeHash(final List<Entry> list) {
final int l = list.size() > config.getFingerLength() ? config.getFingerLength() : list.size();
final byte[] fprint = new byte[(l * HASH_LENGTH)];
for (int i = 0; i < l; i++) {
final byte[] a = list.get(i).getBytes();
for (int j = 0; j < HASH_LENGTH; j++) {
fprint[i * HASH_LENGTH + j] = a[j];
}
}
return fprint;
}
/**
* reconstruct map from fingerprint hash
*
* @param buffer
* @return HashMap
*/
protected static HashMap<String, Entry> makeMap(final byte[] buffer) {
final HashMap<String, Entry> ret = new HashMap<String, Entry>();
for (int i = 0; i < buffer.length / HASH_LENGTH; i++) {
final Entry in = new Entry(buffer, (i * HASH_LENGTH));
ret.put(in.key, in);
}
return ret;
}
/**
* <code>fingermap</code> SubstringFingerprint - hastable of substrings to number of occurences
*/
HashMap<String, Entry> fingermap = null;
/**
* <code>pos</code> SubstringFingerprint - position in lastSeen array.
*/
int pos = 0;
/**
* <code>lastSeen</code> SubstringFingerprint -
*/
String[] lastSeen = null;
/**
* @param fi FingerprintFile
* @modelguid {53F63DBA-703A-4B81-882B-F1B4318D5CC9}
*/
public SubstringFingerprint(final FingerprintFile fi) {
super(fi);
}
/**
* do 'Jaccard coefficient' calculation on set of values<BR>
* TODO: maybe adjust by the number of occurrences
*
* @param fi
* @return int - percentage
*/
private float compare(final SubstringFingerprint fi) {
final Set<String> setU = new HashSet<String>();
final Set<String> setI = new HashSet<String>();
setU.addAll(fingermap.keySet());
setU.addAll(fi.fingermap.keySet());
setI.addAll(fingermap.keySet());
setI.retainAll(fi.fingermap.keySet());
return (float)setI.size() / (float)setU.size() * 100.0f;
}
/**
* @return String
* @modelguid {E440866E-6C37-49CC-8B8A-E6645E19C3B1}
*/
@Override
public String getClassName() {
return super.getClassName() + '-' + config.getFingerLength();
}
/**
* @see net.za.grasser.duplicate.fingerprint.AbstractFingerprint#getSimilarityThreshhold()
*/
@Override
public float getSimilarityThreshhold() {
return config.getSimilarityThreshhold();
}
/**
* @param fi AbstractFingerprint
* @return boolean
* @modelguid {3E6D5504-3A75-4444-BE5B-C23BB0DF4DE8}
*/
@Override
public float matches(final AbstractFingerprint fi) {
final FingerprintFile ff1 = getFile();
final FingerprintFile ff2 = fi.getFile();
if (ff1.getStatus() != Status.FILE_OK && ff1.getStatus() != Status.FILE_SIGNATURE_MISMATCH || ff2.getStatus() != Status.FILE_OK
&& ff2.getStatus() != Status.FILE_SIGNATURE_MISMATCH) {
return 0.0f;
}
if (ff1.getLength() == 0L && ff2.getLength() == 0L) {
return 100.0f;
}
if (fi instanceof SubstringFingerprint) {
final SubstringFingerprint tf = (SubstringFingerprint)fi;
// try to read the files
getFingerprint();
tf.getFingerprint();
if ((ff1.getStatus() == Status.FILE_OK || ff1.getStatus() == Status.FILE_SIGNATURE_MISMATCH)
&& (ff2.getStatus() == Status.FILE_OK || ff2.getStatus() == Status.FILE_SIGNATURE_MISMATCH)) {
if (fingermap == null) {
fingermap = makeMap(getFingerprint());
}
if (tf.fingermap == null) {
tf.fingermap = makeMap(tf.getFingerprint());
}
final float res = compare(tf);
if (log.isTraceEnabled()) {
log.trace(ff1.getPath() + "=" + res + "=" + ff2.getPath());
}
return res;
}
}
return 0.0f;
}
/**
* teardown phase after reading the file<BR>
* fingerprint must be computed here
*/
@Override
protected void postRead() {
// don't waste space
lastSeen = null;
// determine least occuring strings
final List<Entry> list = new ArrayList<Entry>();
final Iterator<Entry> it = fingermap.values().iterator();
while (it.hasNext()) {
list.add(it.next());
}
Collections.sort(list);
if (list.size() > config.getFingerLength()) {
for (int i = config.getFingerLength(); i < list.size(); i++) {
fingermap.remove(list.get(i).key);
}
}
// make fingerprint
fingerprint = makeHash(list);
}
/**
* setup phase before reading the file
*/
@Override
protected void preRead() {
fingermap = new HashMap<String, Entry>();
lastSeen = new String[ANCHOR_LENGTH];
}
/**
* add substrings to the fingerprint map
*
* @param pBuffer
* @param pLength
*/
@Override
protected void update(final byte[] pBuffer, final int pLength) {
String s = null;
for (int i = 0; i < pLength - ANCHOR_LENGTH; i++) {
s = new String(pBuffer, i, ANCHOR_LENGTH);
boolean notfound = true;
for (int j = 0; j < ANCHOR_LENGTH; j++) {
final int p = (j + pos) % ANCHOR_LENGTH;
if (lastSeen[p] != null && s.equals(lastSeen[p])) {
notfound = false;
break;
}
}
pos = (pos + 1) % ANCHOR_LENGTH;
if (notfound) {
lastSeen[pos] = s;
final Object val = fingermap.get(s);
if (val == null) {
fingermap.put(s, new Entry(s, 1));
} else {
((Entry)val).inc();
}
} else {
lastSeen[pos] = null;
}
}
}
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?