findhash.c
来自「开放源码的编译器open watcom 1.6.0版的源代码」· C语言 代码 · 共 1,153 行 · 第 1/3 页
C
1,153 行
if( fscanf( fp, "%s %s\n", keyword, class ) != 2 )
break;
if( size > MAX_KEYWORDS ) {
fatal( "too many keywords" );
}
key_len = strlen( keyword );
if( key_len < min_len ) {
min_len = key_len;
}
if( key_len > max_len ) {
max_len = key_len;
}
if( key_len >= ( CHAR_BIT * sizeof( len_mask ) ) ) {
fatal( "keyword is too long" );
}
len_mask |= ( 1L << key_len );
position[size] = size;
tokens[size] = strdup( keyword );
if( strcmp( class, "TC_A?" ) == 0 ) {
sprintf( class, "TC_A%u", 3 - ( key_len & 0x03 ) );
}
token_class[size] = strdup( class );
++size;
}
if( !feof( fp ) ) {
fatal( "invalid token file" );
}
fclose( fp );
}
num_keywords = size - 1;
if( num_keywords == 0 ) {
fatal( "no keywords" );
}
qsort( &position[1], num_keywords, sizeof( int ), cmptok );
for( i = 1; i < num_keywords; ++i ) {
for( j = i; j < num_keywords; ++j ) {
if( position[j] == i ) {
break;
}
}
k = position[i];
swap( &tokens[k], &tokens[i] );
swap( &token_class[k], &token_class[i] );
position[j] = k;
}
col = 1;
for( i = 1; i <= num_keywords; ++i ) {
tok_len = strlen( tokens[i] );
col += tok_len + 1;
if( col > 79 ) {
output( "\n" );
col = tok_len + 2;
}
output( "%s ", tokens[i] );
}
output( "\n%u keywords min_len=%u max_len=%u\n", num_keywords, min_len, max_len );
}
void init_arrays( size_t first_index, size_t last_index )
/*******************************************************/
{
keyword_t i;
letter_t c;
for( c = LETTER_MIN; c <= LETTER_MAX; ++c ) {
freq[ c ] = 0;
}
for( i = 1; i <= num_keywords; ++i ) {
len[ i ] = strlen( tokens[ i ] );
if( len[i] > first_index ) {
first[i] = make_letter( tokens[i][first_index] );
} else {
first[i] = make_letter( tokens[i][ len[i] - 1 ] );
}
if( len[i] > last_index ) {
last[i] = make_letter( tokens[i][ len[i] - ( last_index + 1 ) ] );
} else {
last[i] = make_letter( tokens[i][0] );
}
done[ i ] = 0;
// these improve the hash function
init_hash[ i ] = len[ i ] + tokens[i][ min_len ];
//init_hash[ i ] = len[ i ] + tokens[i][ len[i] >> 1 ];
//init_hash[ i ] = len[ i ];
hash[ i ] = init_hash[ i ];
++freq[ first[ i ] ];
++freq[ last[ i ] ];
}
for( i = 1; i <= num_keywords; ++i ) {
ordered[ i ] = 0;
}
for( i = 0; i < hashsize; ++i ) {
used[ i ] = NULL_KEYWORD;
}
// weight 0 is never used so it can be used in weights[] to see
// what letters have weights
for( i = 1; i <= MAX_WEIGHTS; ++i ) {
available[ i ] = TRUE;
}
}
void sort_frequency( void )
/*************************/
{
letter_t previous;
letter_t after_c;
letter_t c;
char prt_c;
boolean change;
/* find the alphabetic character that occurs the most in the keywords */
most_used_character = LETTER_MIN;
for( c = LETTER_MIN; c <= LETTER_MAX; ++c ) {
if( c == LETTER__ )
continue;
if( freq[ c ] > freq[ most_used_character ] ) {
most_used_character = c;
}
}
/* make the next[] links into a ring */
for( c = LETTER_MIN; c < LETTER_MAX; ++c ) {
next[ c ] = c + 1;
}
next[ LETTER_MAX ] = LETTER_MIN;
/* sort the list of characters in descending order of frequency */
do {
change = FALSE;
c = most_used_character;
for(;;) {
previous = c;
c = next[ previous ];
after_c = next[ c ];
if( after_c == most_used_character )
break;
if( freq[ c ] < freq[ after_c ] ) {
/* exchange 'c' and 'after_c' */
next[ c ] = next[ after_c ];
next[ after_c ] = c;
next[ previous ] = after_c;
change = TRUE;
}
}
} while( change );
/* sort lists of equal frequency characters in ascending order */
do {
change = FALSE;
c = most_used_character;
for(;;) {
previous = c;
c = next[ previous ];
after_c = next[ c ];
if( after_c == most_used_character )
break;
if( freq[ c ] == freq[ after_c ] ) {
if( c > after_c ) {
/* exchange 'c' and 'after_c' */
next[ c ] = next[ after_c ];
next[ after_c ] = c;
next[ previous ] = after_c;
change = TRUE;
}
}
}
} while( change );
output( "frequency ordering of characters: " );
/* update the prev pointers to reflect the new ordering */
c = most_used_character;
do {
if( freq[ c ] != 0 ) {
prt_c = make_char( c );
if( isprint( prt_c ) ) {
output( "%c", prt_c );
} else {
output( "\\x%02x", prt_c );
}
}
after_c = next[ c ];
prev[ after_c ] = c;
c = after_c;
} while( c != most_used_character );
output( "\n" );
}
unsigned do_hash( unsigned x )
{
x &= hashmask-1;
if( x >= hashsize )
x -= hashsize;
return x;
}
void undo( letter_t c, keyword_t i )
/**********************************/
{
keyword_t index;
unsigned first_weight;
unsigned last_weight;
/*
every keyword that had a full hash value calculated
because of the weight of the character specified
must deregister its hash value position
*/
first_weight = first_scale * weights[c];
last_weight = last_scale * weights[c];
for( ; i > 0; --i ) {
if( first[ i ] == c ) {
--done[ i ];
if( done[ i ] == 1 ) { // 2 -> 1 transition
index = do_hash( hash[ i ] );
used[ index ] = NULL_KEYWORD;
}
hash[ i ] -= first_weight;
}
if( last[ i ] == c ) {
--done[ i ];
if( done[ i ] == 1 ) { // 2 -> 1 transition
index = do_hash( hash[ i ] );
used[ index ] = NULL_KEYWORD;
}
hash[ i ] -= last_weight;
}
}
}
boolean share_letter( letter_t c, letter_t *p1, letter_t *p2 )
/************************************************************/
{
keyword_t i;
unsigned h;
for( i = 0; i < hashsize; ++i ) {
partial[i] = NULL_KEYWORD;
}
for( i=1; i <= num_keywords; ++i ) {
if( done[i] == 1 ) {
if( p1[i] == c || p2[i] == c ) {
h = do_hash( hash[i] );
if( partial[h] != NULL_KEYWORD ) {
++collisions[i];
++collisions[ partial[h] ];
return( TRUE );
}
partial[h] = i;
}
}
}
return( FALSE );
}
boolean check( letter_t c )
/*************************/
{
/*
- select all keywords with one more weight to add
- if any keyword in this set shares a letter with
another keyword in the set and the hash values
are identical, the current set of weights cannot work
since adding the same weight to both hashes will
not make them different
*/
if( first_scale == 1 && last_scale == 1 ) {
while( freq[c] != 0 && c != most_used_character ) {
if( share_letter( c, first, last ) ) {
return( FALSE );
}
c = next[c];
}
} else {
while( freq[c] != 0 && c != most_used_character ) {
if( share_letter( c, first, first ) ) {
return( FALSE );
}
if( share_letter( c, last, last ) ) {
return( FALSE );
}
c = next[c];
}
}
return( TRUE );
}
boolean try_hash( letter_t c )
/****************************/
{
unsigned found;
keyword_t index;
keyword_t i;
boolean works;
unsigned first_weight;
unsigned last_weight;
unsigned adjust;
works = TRUE;
first_weight = first_scale * weights[c];
last_weight = last_scale * weights[c];
for( i = 1; i <= num_keywords; ++i ) {
if( first[ i ] == c ) {
if( last[ i ] == c ) {
adjust = first_weight + last_weight;
found = 2;
} else {
adjust = first_weight;
found = 1;
}
} else if( last[ i ] == c ) {
adjust = last_weight;
found = 1;
} else {
continue;
}
if( done[i] + found == 2 ) {
index = do_hash( hash[ i ] + adjust );
if( used[ index ] != NULL_KEYWORD ) {
works = FALSE;
break;
}
done[ i ] += found;
hash[ i ] += adjust;
used[ index ] = i;
} else {
done[ i ] += found;
hash[ i ] += adjust;
}
}
if( works == TRUE ) {
works = check( next[ c ] );
}
if( works == FALSE ) {
undo( c, i - 1 );
}
return( works );
}
unsigned next_weight( void )
/**************************/
{
int i;
for( i = 1; i <= MAX_WEIGHTS; ++i ) {
if( available[i] ) {
return( i );
}
}
return( 0 );
}
void try_for_hash( void )
/***********************/
{
boolean works;
char c;
keyword_t search;
c = most_used_character;
weights[ c ] = 1;
available[ weights[c] ] = FALSE;
do {
works = try_hash( c );
if( works ) {
available[ weights[ c ] ] = FALSE;
c = next[ c ];
if( c == most_used_character )
break;
search = next_weight();
if( search == 0 )
break;
weights[ c ] = search;
} else {
do {
while( weights[c] == num_keywords && c != most_used_character ){
c = prev[ c ];
available[ weights[c] ] = TRUE;
undo( c, num_keywords );
}
if( c == most_used_character )
break;
weights[ c ]++;
} while( ! available[ weights[c] ] );
}
} while( ( freq[ c ] != 0 ) && ( c != most_used_character ) );
while( c != most_used_character ) { /* initialize rest of weights */
search = next_weight();
if( search == 0 )
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?