findhash.c

来自「开放源码的编译器open watcom 1.6.0版的源代码」· C语言 代码 · 共 1,153 行 · 第 1/3 页

C
1,153
字号
            if( fscanf( fp, "%s %s\n", keyword, class ) != 2 )
                break;
            if( size > MAX_KEYWORDS ) {
                fatal( "too many keywords" );
            }
            key_len = strlen( keyword );
            if( key_len < min_len ) {
                min_len = key_len;
            }
            if( key_len > max_len ) {
                max_len = key_len;
            }
            if( key_len >= ( CHAR_BIT * sizeof( len_mask ) ) ) {
                fatal( "keyword is too long" );
            }
            len_mask |= ( 1L << key_len );
            position[size] = size;
            tokens[size] = strdup( keyword );
            if( strcmp( class, "TC_A?" ) == 0 ) {
                sprintf( class, "TC_A%u", 3 - ( key_len & 0x03 ) );
            }
            token_class[size] = strdup( class );
            ++size;
        }
        if( !feof( fp ) ) {
            fatal( "invalid token file" );
        }
        fclose( fp );
    }
    num_keywords = size - 1;
    if( num_keywords == 0 ) {
        fatal( "no keywords" );
    }
    qsort( &position[1], num_keywords, sizeof( int ), cmptok );
    for( i = 1; i < num_keywords; ++i ) {
        for( j = i; j < num_keywords; ++j ) {
            if( position[j] == i ) {
                break;
            }
        }
        k = position[i];
        swap( &tokens[k], &tokens[i] );
        swap( &token_class[k], &token_class[i] );
        position[j] = k;
    }
    col = 1;
    for( i = 1; i <= num_keywords; ++i ) {
        tok_len = strlen( tokens[i] );
        col += tok_len + 1;
        if( col > 79 ) {
            output( "\n" );
            col = tok_len + 2;
        }
        output( "%s ", tokens[i] );
    }
    output( "\n%u keywords min_len=%u max_len=%u\n", num_keywords, min_len, max_len );
}

void init_arrays( size_t first_index, size_t last_index )
/*******************************************************/
{
    keyword_t i;
    letter_t c;

    for( c = LETTER_MIN; c <= LETTER_MAX; ++c ) {
        freq[ c ] = 0;
    }
    for( i = 1; i <= num_keywords; ++i ) {
        len[ i ] = strlen( tokens[ i ] );
        if( len[i] > first_index ) {
            first[i] = make_letter( tokens[i][first_index] );
        } else {
            first[i] = make_letter( tokens[i][ len[i] - 1 ] );
        }
        if( len[i] > last_index ) {
            last[i] = make_letter( tokens[i][ len[i] - ( last_index + 1 ) ] );
        } else {
            last[i] = make_letter( tokens[i][0] );
        }
        done[ i ] = 0;
        // these improve the hash function
        init_hash[ i ] = len[ i ] + tokens[i][ min_len ];
        //init_hash[ i ] = len[ i ] + tokens[i][ len[i] >> 1 ];
        //init_hash[ i ] = len[ i ];
        hash[ i ] = init_hash[ i ];
        ++freq[ first[ i ] ];
        ++freq[ last[ i ] ];
    }
    for( i = 1; i <= num_keywords; ++i ) {
        ordered[ i ] = 0;
    }
    for( i = 0; i < hashsize; ++i ) {
        used[ i ] = NULL_KEYWORD;
    }
    // weight 0 is never used so it can be used in weights[] to see
    // what letters have weights
    for( i = 1; i <= MAX_WEIGHTS; ++i ) {
        available[ i ] = TRUE;
    }
}

void sort_frequency( void )
/*************************/
{
    letter_t previous;
    letter_t after_c;
    letter_t c;
    char prt_c;
    boolean change;

    /* find the alphabetic character that occurs the most in the keywords */
    most_used_character = LETTER_MIN;
    for( c = LETTER_MIN; c <= LETTER_MAX; ++c ) {
        if( c == LETTER__ )
            continue;
        if( freq[ c ] > freq[ most_used_character ] ) {
            most_used_character = c;
        }
    }
    /* make the next[] links into a ring */
    for( c = LETTER_MIN; c < LETTER_MAX; ++c ) {
        next[ c ] = c + 1;
    }
    next[ LETTER_MAX ] = LETTER_MIN;
    /* sort the list of characters in descending order of frequency */
    do {
        change = FALSE;
        c = most_used_character;
        for(;;) {
            previous = c;
            c = next[ previous ];
            after_c = next[ c ];
            if( after_c == most_used_character )
                break;
            if( freq[ c ] < freq[ after_c ] ) {
                /* exchange 'c' and 'after_c' */
                next[ c ] = next[ after_c ];
                next[ after_c ] = c;
                next[ previous ] = after_c;
                change = TRUE;
            }
        }
    } while( change );
    /* sort lists of equal frequency characters in ascending order */
    do {
        change = FALSE;
        c = most_used_character;
        for(;;) {
            previous = c;
            c = next[ previous ];
            after_c = next[ c ];
            if( after_c == most_used_character )
                break;
            if( freq[ c ] == freq[ after_c ] ) {
                if( c > after_c ) {
                    /* exchange 'c' and 'after_c' */
                    next[ c ] = next[ after_c ];
                    next[ after_c ] = c;
                    next[ previous ] = after_c;
                    change = TRUE;
                }
            }
        }
    } while( change );
    output( "frequency ordering of characters: " );
    /* update the prev pointers to reflect the new ordering */
    c = most_used_character;
    do {
        if( freq[ c ] != 0 ) {
            prt_c = make_char( c );
            if( isprint( prt_c ) ) {
                output( "%c", prt_c );
            } else {
                output( "\\x%02x", prt_c );
            }
        }
        after_c = next[ c ];
        prev[ after_c ] = c;
        c = after_c;
    } while( c != most_used_character );
    output( "\n" );
}

unsigned do_hash( unsigned x )
{
    x &= hashmask-1;
    if( x >= hashsize )
        x -= hashsize;
    return x;
}

void undo( letter_t c, keyword_t i )
/**********************************/
{
    keyword_t index;
    unsigned first_weight;
    unsigned last_weight;

    /*
        every keyword that had a full hash value calculated
        because of the weight of the character specified
        must deregister its hash value position
    */
    first_weight = first_scale * weights[c];
    last_weight = last_scale * weights[c];
    for( ; i > 0; --i ) {
        if( first[ i ] == c ) {
            --done[ i ];
            if( done[ i ] == 1 ) {      // 2 -> 1 transition
                index = do_hash( hash[ i ] );
                used[ index ] = NULL_KEYWORD;
            }
            hash[ i ] -= first_weight;
        }
        if( last[ i ] == c ) {
            --done[ i ];
            if( done[ i ] == 1 ) {      // 2 -> 1 transition
                index = do_hash( hash[ i ] );
                used[ index ] = NULL_KEYWORD;
            }
            hash[ i ] -= last_weight;
        }
    }
}

boolean share_letter( letter_t c, letter_t *p1, letter_t *p2 )
/************************************************************/
{
    keyword_t i;
    unsigned h;

    for( i = 0; i < hashsize; ++i ) {
        partial[i] = NULL_KEYWORD;
    }
    for( i=1; i <= num_keywords; ++i ) {
        if( done[i] == 1 ) {
            if( p1[i] == c || p2[i] == c ) {
                h = do_hash( hash[i] );
                if( partial[h] != NULL_KEYWORD ) {
                    ++collisions[i];
                    ++collisions[ partial[h] ];
                    return( TRUE );
                }
                partial[h] = i;
            }
        }
    }
    return( FALSE );
}

boolean check( letter_t c )
/*************************/
{
    /*
        - select all keywords with one more weight to add
        - if any keyword in this set shares a letter with
          another keyword in the set and the hash values
          are identical, the current set of weights cannot work
          since adding the same weight to both hashes will
          not make them different
    */

    if( first_scale == 1 && last_scale == 1 ) {
        while( freq[c] != 0 && c != most_used_character ) {
            if( share_letter( c, first, last ) ) {
                return( FALSE );
            }
            c = next[c];
        }
    } else {
        while( freq[c] != 0 && c != most_used_character ) {
            if( share_letter( c, first, first ) ) {
                return( FALSE );
            }
            if( share_letter( c, last, last ) ) {
                return( FALSE );
            }
            c = next[c];
        }
    }
    return( TRUE );
}

boolean try_hash( letter_t c )
/****************************/
{
    unsigned found;
    keyword_t index;
    keyword_t i;
    boolean works;
    unsigned first_weight;
    unsigned last_weight;
    unsigned adjust;

    works = TRUE;
    first_weight = first_scale * weights[c];
    last_weight = last_scale * weights[c];
    for( i = 1; i <= num_keywords; ++i ) {
        if( first[ i ] == c ) {
            if( last[ i ] == c ) {
                adjust = first_weight + last_weight;
                found = 2;
            } else {
                adjust = first_weight;
                found = 1;
            }
        } else if( last[ i ] == c ) {
            adjust = last_weight;
            found = 1;
        } else {
            continue;
        }
        if( done[i] + found == 2 ) {
            index = do_hash( hash[ i ] + adjust );
            if( used[ index ] != NULL_KEYWORD ) {
                works = FALSE;
                break;
            }
            done[ i ] += found;
            hash[ i ] += adjust;
            used[ index ] = i;
        } else {
            done[ i ] += found;
            hash[ i ] += adjust;
        }
    }
    if( works == TRUE ) {
        works = check( next[ c ] );
    }
    if( works == FALSE ) {
        undo( c, i - 1 );
    }
    return( works );
}

unsigned next_weight( void )
/**************************/
{
    int i;

    for( i = 1; i <= MAX_WEIGHTS; ++i ) {
        if( available[i] ) {
            return( i );
        }
    }
    return( 0 );
}

void try_for_hash( void )
/***********************/
{
    boolean     works;
    char        c;
    keyword_t   search;

    c = most_used_character;
    weights[ c ] = 1;
    available[ weights[c] ] = FALSE;
    do {
        works = try_hash( c );
        if( works ) {
            available[ weights[ c ] ] = FALSE;
            c = next[ c ];
            if( c == most_used_character )
                break;
            search = next_weight();
            if( search == 0 )
                break;
            weights[ c ] = search;
        } else {
            do {
                while( weights[c] == num_keywords && c != most_used_character ){
                    c = prev[ c ];
                    available[ weights[c] ] = TRUE;
                    undo( c, num_keywords );
                }
                if( c == most_used_character )
                    break;
                weights[ c ]++;
            } while( ! available[ weights[c] ] );
        }
    } while( ( freq[ c ] != 0 ) && ( c != most_used_character ) );
    while( c != most_used_character ) { /* initialize rest of weights */
        search = next_weight();
        if( search == 0 )

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?