📄 hash.cpp
字号:
case 8 : b+=k[1]; a+=k[0]; break;
case 7 : b+=((UINT32)k8[6])<<16; /* fall through */
case 6 : b+=((UINT32)k8[5])<<8; /* fall through */
case 5 : b+=k8[4]; /* fall through */
case 4 : a+=k[0]; break;
case 3 : a+=((UINT32)k8[2])<<16; /* fall through */
case 2 : a+=((UINT32)k8[1])<<8; /* fall through */
case 1 : a+=k8[0]; break;
case 0 : return c;
}
#endif /* !valgrind */
} else if ((u.i & 0x1) == 0) {
const UINT16 *k = (const UINT16 *)key; /* read 16-bit chunks */
const UINT8 *k8;
/*--------------- all but last block: aligned reads and different mixing */
while (length > 12)
{
a += k[0] + (((UINT32)k[1])<<16);
b += k[2] + (((UINT32)k[3])<<16);
c += k[4] + (((UINT32)k[5])<<16);
mix(a,b,c);
length -= 12;
k += 6;
}
/*----------------------------- handle the last (probably partial) block */
k8 = (const UINT8 *)k;
switch(length)
{
case 12: c+=k[4]+(((UINT32)k[5])<<16);
b+=k[2]+(((UINT32)k[3])<<16);
a+=k[0]+(((UINT32)k[1])<<16);
break;
case 11: c+=((UINT32)k8[10])<<16; /* fall through */
case 10: c+=k[4];
b+=k[2]+(((UINT32)k[3])<<16);
a+=k[0]+(((UINT32)k[1])<<16);
break;
case 9 : c+=k8[8]; /* fall through */
case 8 : b+=k[2]+(((UINT32)k[3])<<16);
a+=k[0]+(((UINT32)k[1])<<16);
break;
case 7 : b+=((UINT32)k8[6])<<16; /* fall through */
case 6 : b+=k[2];
a+=k[0]+(((UINT32)k[1])<<16);
break;
case 5 : b+=k8[4]; /* fall through */
case 4 : a+=k[0]+(((UINT32)k[1])<<16);
break;
case 3 : a+=((UINT32)k8[2])<<16; /* fall through */
case 2 : a+=k[0];
break;
case 1 : a+=k8[0];
break;
case 0 : return c; /* zero length requires no mixing */
}
} else { /* need to read the key one byte at a time */
const UINT8 *k = (const UINT8 *)key;
/*--------------- all but the last block: affect some 32 bits of (a,b,c) */
while (length > 12)
{
a += k[0];
a += ((UINT32)k[1])<<8;
a += ((UINT32)k[2])<<16;
a += ((UINT32)k[3])<<24;
b += k[4];
b += ((UINT32)k[5])<<8;
b += ((UINT32)k[6])<<16;
b += ((UINT32)k[7])<<24;
c += k[8];
c += ((UINT32)k[9])<<8;
c += ((UINT32)k[10])<<16;
c += ((UINT32)k[11])<<24;
mix(a,b,c);
length -= 12;
k += 12;
}
/*-------------------------------- last block: affect all 32 bits of (c) */
switch(length) /* all the case statements fall through */
{
case 12: c+=((UINT32)k[11])<<24;
case 11: c+=((UINT32)k[10])<<16;
case 10: c+=((UINT32)k[9])<<8;
case 9 : c+=k[8];
case 8 : b+=((UINT32)k[7])<<24;
case 7 : b+=((UINT32)k[6])<<16;
case 6 : b+=((UINT32)k[5])<<8;
case 5 : b+=k[4];
case 4 : a+=((UINT32)k[3])<<24;
case 3 : a+=((UINT32)k[2])<<16;
case 2 : a+=((UINT32)k[1])<<8;
case 1 : a+=k[0];
break;
case 0 : return c;
}
}
final(a,b,c);
return c;
}
// === Collision resolution ====
typedef UINT (*HASH_FUNC)(const CHAR*, SIZE_T);
#define MAX_TABLE_SIZE 8209
#define MAX_TEXT_LEN (MAX_TABLE_SIZE * 16)
CHAR* g_lines[MAX_TABLE_SIZE];
CHAR g_text[MAX_TEXT_LEN];
UINT g_collisions, g_lines_count, g_table_size_mask;
UINT g_table_size, g_table_size_magic, g_table_size_shift;
#ifdef USE_PRIME
UINT inline mod_table_size(UINT x) {
UINT mul_hi_dword = (UINT)( __emulu(x, g_table_size_magic) >> 32 );
UINT ret = x - (mul_hi_dword >> g_table_size_shift) * g_table_size;
assert(ret == x % g_table_size);
return ret;
}
#endif
// =================
// Separate chaining
#ifdef USE_CHAINING
struct CHAIN {
const CHAR* key;
CHAIN* next;
};
CHAIN* g_table[MAX_TABLE_SIZE];
CHAIN g_chains[MAX_TABLE_SIZE];
UINT g_next_free_chain;
void InitHashTable() {
g_next_free_chain = 0;
g_collisions = 0;
memset(g_table, 0, sizeof(g_table));
}
BOOL Lookup(const CHAR* key, SIZE_T len, HASH_FUNC hash_func, BOOL add) {
#ifdef USE_PRIME
UINT hash = mod_table_size( hash_func(key, len) );
#else
UINT hash = hash_func(key, len) & g_table_size_mask;
#endif
// Look up the value in the chain
UINT collisions = 0;
for(CHAIN* chain = g_table[hash]; chain != NULL; chain = chain->next) {
if( memcmp(chain->key, key, len * sizeof(CHAR)) == 0 )
return TRUE;
collisions++;
}
// If it was not found, add it
if(add) {
CHAIN* next = g_table[hash];
g_table[hash] = &g_chains[g_next_free_chain++];
g_table[hash]->next = next;
g_table[hash]->key = key;
g_collisions += collisions;
}
return FALSE;
}
// =================
// Open Addressing
#else
struct ITEM {
const CHAR* key;
#ifdef STORE_32
UINT hash32;
#endif
};
ITEM g_table[MAX_TABLE_SIZE];
void InitHashTable() {
g_collisions = 0;
memset(g_table, 0, sizeof(g_table));
}
BOOL Lookup(const CHAR* key, SIZE_T len, HASH_FUNC hash_func, BOOL add) {
UINT hash32 = hash_func(key, len);
#ifdef USE_PRIME
UINT hash = mod_table_size( hash32 );
#else
UINT hash = hash32 & g_table_size_mask;
#endif
// Look up the value in the table
UINT collisions = 0;
while(g_table[hash].key != 0) {
#ifdef STORE_32
if( g_table[hash].hash32 == hash32 &&
memcmp(g_table[hash].key, key, len * sizeof(CHAR)) == 0 )
return TRUE;
#else
if( memcmp(g_table[hash].key, key, len * sizeof(CHAR)) == 0 )
return TRUE;
#endif
#ifdef USE_PRIME
hash = mod_table_size(hash + 1);
#else
hash = (hash + 1) & g_table_size_mask;
#endif
collisions++;
}
// If it was not found, add it
if(add) {
g_table[hash].key = key;
#ifdef STORE_32
g_table[hash].hash32 = hash32;
#endif
g_collisions += collisions;
}
return FALSE;
}
#endif
// ======= Benchmarks ============
UINT64 inline GetRDTSC(void) {
__asm {
; Flush the pipeline
XOR eax, eax
CPUID
; Get RDTSC counter in edx:eax
RDTSC
}
}
void BenchmarkHashFunction( HASH_FUNC hash_func, const CHAR* name ) {
printf("%20s: ", name);
INT min_time = INT_MAX;
for(INT k = 0; k < 5; k++) {
InitHashTable();
UINT64 start = GetRDTSC();
for(UINT i = 0; i < g_lines_count; ++i)
Lookup(g_lines[i], strlen(g_lines[i]), hash_func, TRUE);
for(UINT i = 0; i < g_lines_count; ++i)
Lookup(g_lines[i], strlen(g_lines[i]), hash_func, FALSE);
UINT64 time = GetRDTSC() - start;
printf("%10d", (INT)(time + 500) / 1000);
if(time < min_time)
min_time = (INT)time;
}
printf("|%10d", (INT)(min_time + 500) / 1000);
printf(" [%5d]\n", g_collisions);
}
void PrintTestWords(const HASH_FUNC hash_func[], const CHAR* name[], SIZE_T nfunc) {
static const CHAR* word[] = {"too", "top", "tor", "tpp", "a000", "a001", "a002", "a003",
"a004", "a005", "a006", "a007", "a008", "a009", "a010", "a", "aa", "aaa"
};
// Header
printf( "%4c", ' ' );
for(UINT j = 0; j < nfunc; j++) {
CHAR func_name[16];
strncpy(func_name, name[j], 9);
func_name[9] = '\0';
printf( "%10s", func_name );
}
printf( "\n" );
for(UINT i = 0; i < NUM_OF(word); i++) {
printf( "%4s", word[i] );
for(UINT j = 0; j < nfunc; j++)
printf( "%10x", hash_func[j](word[i], strlen(word[i])) );
printf( "\n" );
}
}
UINT NextPowerOfTwo(UINT x) {
// Henry Warren, "Hacker's Delight", ch. 3.2
x--;
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
return x + 1;
}
UINT NextLog2(UINT x) {
// Henry Warren, "Hacker's Delight", ch. 5.3
if(x <= 1) return x;
x--;
UINT n = 0;
UINT y;
y = x >>16; if(y) {n += 16; x = y;}
y = x >> 8; if(y) {n += 8; x = y;}
y = x >> 4; if(y) {n += 4; x = y;}
y = x >> 2; if(y) {n += 2; x = y;}
y = x >> 1; if(y) return n + 2;
return n + x;
}
int main(int argc, char* argv[]) {
CRC32Init();
// Check value from the guide by Ross N. Williams
assert( CRC32("123456789", 9) == 0xCBF43926 );
if(argc <= 1) {
puts("Usage: hash text-file");
return 1;
}
// Read the file line-by-line into g_text buffer
FILE* file = fopen(argv[1], "r");
if(!file) {
puts("Can't open the file");
return 1;
}
CHAR* text = g_text;
SIZE_T remaining = MAX_TEXT_LEN;
UINT i = 0;
while( fgets(text, (UINT)remaining, file) && i < MAX_TABLE_SIZE / 2 ) {
SIZE_T len = strlen(text);
if(text[len-1] == '\n')
text[--len] = '\0';
if(len) {
g_lines[i++] = text;
text += len + 1;
remaining -= len + 1;
}
}
fclose(file);
g_lines_count = i;
#ifdef USE_PRIME
// 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192
UINT closest_prime[] = {2, 2, 5, 11, 17, 37, 67, 131, 257, 521, 1031, 2053, 4099, 8209,
16411, 32771, 65537, 131101, 262147, 524309, 1048583, 2097169,
4194319, 8388617, 16777259, 33554467, 67108879, 134217757,
268435459, 536870923, 1073741827};
UINT magic[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x7f218557, 0, 0xffd008ff, 0x7fbc240d};
UINT shift[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 12, 12};
UINT log = NextLog2(i) + 1;
g_table_size = closest_prime[ log ];
g_table_size_magic = magic[ log ];
g_table_size_shift = shift[ log ];
if( g_table_size_magic == 0 )
printf("table size not supported: %d", g_table_size);
printf("%d lines read\n" "%d elements in the table\n",
g_lines_count, g_table_size);
#else
g_table_size_mask = NextPowerOfTwo(i) * 2 - 1;
printf("%d lines read\n" "%d elements in the table\n",
g_lines_count, g_table_size_mask + 1);
#endif
// Run the benchmarks
static const HASH_FUNC func[] = {HashBernstein, HashKernighanRitchie,
Hash17_unrolled, Hash65599, HashFNV1a, HashUniversal,
HashWeinberger, HashLarson,
HashPaulHsieh, HashOneAtTime, HashLookup3, HashArashPartow,
CRC32, HashRamakrishna, HashFletcher, HashMurmur2 };
static const CHAR* name[] = {"Bernstein", "Kernighan&Ritchie",
"x17 unrolled", "x65599", "FNV-1a", "universal",
"Weinberger", "HashLarson",
"Paul Hsieh", "One At Time", "lookup3", "Arash Partow",
"CRC-32", "Ramakrishna", "Fletcher", "Murmur2" };
assert(NUM_OF(name) == NUM_OF(func));
//PrintTestWords(func + 6, name + 6, NUM_OF(func) - 6);
for(UINT i = 0; i < NUM_OF(func); i++)
BenchmarkHashFunction(func[i], name[i]);
return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -