findhash.c

来自「开放源码的编译器open watcom 1.6.0版的源代码」· C语言代码 · 共 1,153 行 · 第 1/3 页
1,153 行
            if( fscanf( fp, "%s %s\n", keyword, class ) != 2 )
                break;
            if( size > MAX_KEYWORDS ) {
                fatal( "too many keywords" );
            }
            key_len = strlen( keyword );
            if( key_len < min_len ) {
                min_len = key_len;
            }
            if( key_len > max_len ) {
                max_len = key_len;
            }
            if( key_len >= ( CHAR_BIT * sizeof( len_mask ) ) ) {
                fatal( "keyword is too long" );
            }
            len_mask |= ( 1L << key_len );
            position[size] = size;
            tokens[size] = strdup( keyword );
            if( strcmp( class, "TC_A?" ) == 0 ) {
                sprintf( class, "TC_A%u", 3 - ( key_len & 0x03 ) );
            }
            token_class[size] = strdup( class );
            ++size;
        }
        if( !feof( fp ) ) {
            fatal( "invalid token file" );
        }
        fclose( fp );
    }
    num_keywords = size - 1;
    if( num_keywords == 0 ) {
        fatal( "no keywords" );
    }
    qsort( &position[1], num_keywords, sizeof( int ), cmptok );
    for( i = 1; i < num_keywords; ++i ) {
        for( j = i; j < num_keywords; ++j ) {
            if( position[j] == i ) {
                break;
            }
        }
        k = position[i];
        swap( &tokens[k], &tokens[i] );
        swap( &token_class[k], &token_class[i] );
        position[j] = k;
    }
    col = 1;
    for( i = 1; i <= num_keywords; ++i ) {
        tok_len = strlen( tokens[i] );
        col += tok_len + 1;
        if( col > 79 ) {
            output( "\n" );
            col = tok_len + 2;
        }
        output( "%s ", tokens[i] );
    }
    output( "\n%u keywords min_len=%u max_len=%u\n", num_keywords, min_len, max_len );
}

void init_arrays( size_t first_index, size_t last_index )
/*******************************************************/
{
    keyword_t i;
    letter_t c;

    for( c = LETTER_MIN; c <= LETTER_MAX; ++c ) {
        freq[ c ] = 0;
    }
    for( i = 1; i <= num_keywords; ++i ) {
        len[ i ] = strlen( tokens[ i ] );
        if( len[i] > first_index ) {
            first[i] = make_letter( tokens[i][first_index] );
        } else {
            first[i] = make_letter( tokens[i][ len[i] - 1 ] );
        }
        if( len[i] > last_index ) {
            last[i] = make_letter( tokens[i][ len[i] - ( last_index + 1 ) ] );
        } else {
            last[i] = make_letter( tokens[i][0] );
        }
        done[ i ] = 0;
        // these improve the hash function
        init_hash[ i ] = len[ i ] + tokens[i][ min_len ];
        //init_hash[ i ] = len[ i ] + tokens[i][ len[i] >> 1 ];
        //init_hash[ i ] = len[ i ];
        hash[ i ] = init_hash[ i ];
        ++freq[ first[ i ] ];
        ++freq[ last[ i ] ];
    }
    for( i = 1; i <= num_keywords; ++i ) {
        ordered[ i ] = 0;
    }
    for( i = 0; i < hashsize; ++i ) {
        used[ i ] = NULL_KEYWORD;
    }
    // weight 0 is never used so it can be used in weights[] to see
    // what letters have weights
    for( i = 1; i <= MAX_WEIGHTS; ++i ) {
        available[ i ] = TRUE;
    }
}

void sort_frequency( void )
/*************************/
{
    letter_t previous;
    letter_t after_c;
    letter_t c;
    char prt_c;
    boolean change;

    /* find the alphabetic character that occurs the most in the keywords */
    most_used_character = LETTER_MIN;
    for( c = LETTER_MIN; c <= LETTER_MAX; ++c ) {
        if( c == LETTER__ )
            continue;
        if( freq[ c ] > freq[ most_used_character ] ) {
            most_used_character = c;
        }
    }
    /* make the next[] links into a ring */
    for( c = LETTER_MIN; c < LETTER_MAX; ++c ) {
        next[ c ] = c + 1;
    }
    next[ LETTER_MAX ] = LETTER_MIN;
    /* sort the list of characters in descending order of frequency */
    do {
        change = FALSE;
        c = most_used_character;
        for(;;) {
            previous = c;
            c = next[ previous ];
            after_c = next[ c ];
            if( after_c == most_used_character )
                break;
            if( freq[ c ] < freq[ after_c ] ) {
                /* exchange 'c' and 'after_c' */
                next[ c ] = next[ after_c ];
                next[ after_c ] = c;
                next[ previous ] = after_c;
                change = TRUE;
            }
        }
    } while( change );
    /* sort lists of equal frequency characters in ascending order */
    do {
        change = FALSE;
        c = most_used_character;
        for(;;) {
            previous = c;
            c = next[ previous ];
            after_c = next[ c ];
            if( after_c == most_used_character )
                break;
            if( freq[ c ] == freq[ after_c ] ) {
                if( c > after_c ) {
                    /* exchange 'c' and 'after_c' */
                    next[ c ] = next[ after_c ];
                    next[ after_c ] = c;
                    next[ previous ] = after_c;
                    change = TRUE;
                }
            }
        }
    } while( change );
    output( "frequency ordering of characters: " );
    /* update the prev pointers to reflect the new ordering */
    c = most_used_character;
    do {
        if( freq[ c ] != 0 ) {
            prt_c = make_char( c );
            if( isprint( prt_c ) ) {
                output( "%c", prt_c );
            } else {
                output( "\\x%02x", prt_c );
            }
        }
        after_c = next[ c ];
        prev[ after_c ] = c;
        c = after_c;
    } while( c != most_used_character );
    output( "\n" );
}

unsigned do_hash( unsigned x )
{
    x &= hashmask-1;
    if( x >= hashsize )
        x -= hashsize;
    return x;
}

void undo( letter_t c, keyword_t i )
/**********************************/
{
    keyword_t index;
    unsigned first_weight;
    unsigned last_weight;

    /*
        every keyword that had a full hash value calculated
        because of the weight of the character specified
        must deregister its hash value position
    */
    first_weight = first_scale * weights[c];
    last_weight = last_scale * weights[c];
    for( ; i > 0; --i ) {
        if( first[ i ] == c ) {
            --done[ i ];
            if( done[ i ] == 1 ) {      // 2 -> 1 transition
                index = do_hash( hash[ i ] );
                used[ index ] = NULL_KEYWORD;
            }
            hash[ i ] -= first_weight;
        }
        if( last[ i ] == c ) {
            --done[ i ];
            if( done[ i ] == 1 ) {      // 2 -> 1 transition
                index = do_hash( hash[ i ] );
                used[ index ] = NULL_KEYWORD;
            }
            hash[ i ] -= last_weight;
        }
    }
}

boolean share_letter( letter_t c, letter_t *p1, letter_t *p2 )
/************************************************************/
{
    keyword_t i;
    unsigned h;

    for( i = 0; i < hashsize; ++i ) {
        partial[i] = NULL_KEYWORD;
    }
    for( i=1; i <= num_keywords; ++i ) {
        if( done[i] == 1 ) {
            if( p1[i] == c || p2[i] == c ) {
                h = do_hash( hash[i] );
                if( partial[h] != NULL_KEYWORD ) {
                    ++collisions[i];
                    ++collisions[ partial[h] ];
                    return( TRUE );
                }
                partial[h] = i;
            }
        }
    }
    return( FALSE );
}

boolean check( letter_t c )
/*************************/
{
    /*
        - select all keywords with one more weight to add
        - if any keyword in this set shares a letter with
          another keyword in the set and the hash values
          are identical, the current set of weights cannot work
          since adding the same weight to both hashes will
          not make them different
    */

    if( first_scale == 1 && last_scale == 1 ) {
        while( freq[c] != 0 && c != most_used_character ) {
            if( share_letter( c, first, last ) ) {
                return( FALSE );
            }
            c = next[c];
        }
    } else {
        while( freq[c] != 0 && c != most_used_character ) {
            if( share_letter( c, first, first ) ) {
                return( FALSE );
            }
            if( share_letter( c, last, last ) ) {
                return( FALSE );
            }
            c = next[c];
        }
    }
    return( TRUE );
}

boolean try_hash( letter_t c )
/****************************/
{
    unsigned found;
    keyword_t index;
    keyword_t i;
    boolean works;
    unsigned first_weight;
    unsigned last_weight;
    unsigned adjust;

    works = TRUE;
    first_weight = first_scale * weights[c];
    last_weight = last_scale * weights[c];
    for( i = 1; i <= num_keywords; ++i ) {
        if( first[ i ] == c ) {
            if( last[ i ] == c ) {
                adjust = first_weight + last_weight;
                found = 2;
            } else {
                adjust = first_weight;
                found = 1;
            }
        } else if( last[ i ] == c ) {
            adjust = last_weight;
            found = 1;
        } else {
            continue;
        }
        if( done[i] + found == 2 ) {
            index = do_hash( hash[ i ] + adjust );
            if( used[ index ] != NULL_KEYWORD ) {
                works = FALSE;
                break;
            }
            done[ i ] += found;
            hash[ i ] += adjust;
            used[ index ] = i;
        } else {
            done[ i ] += found;
            hash[ i ] += adjust;
        }
    }
    if( works == TRUE ) {
        works = check( next[ c ] );
    }
    if( works == FALSE ) {
        undo( c, i - 1 );
    }
    return( works );
}

unsigned next_weight( void )
/**************************/
{
    int i;

    for( i = 1; i <= MAX_WEIGHTS; ++i ) {
        if( available[i] ) {
            return( i );
        }
    }
    return( 0 );
}

void try_for_hash( void )
/***********************/
{
    boolean     works;
    char        c;
    keyword_t   search;

    c = most_used_character;
    weights[ c ] = 1;
    available[ weights[c] ] = FALSE;
    do {
        works = try_hash( c );
        if( works ) {
            available[ weights[ c ] ] = FALSE;
            c = next[ c ];
            if( c == most_used_character )
                break;
            search = next_weight();
            if( search == 0 )
                break;
            weights[ c ] = search;
        } else {
            do {
                while( weights[c] == num_keywords && c != most_used_character ){
                    c = prev[ c ];
                    available[ weights[c] ] = TRUE;
                    undo( c, num_keywords );
                }
                if( c == most_used_character )
                    break;
                weights[ c ]++;
            } while( ! available[ weights[c] ] );
        }
    } while( ( freq[ c ] != 0 ) && ( c != most_used_character ) );
    while( c != most_used_character ) { /* initialize rest of weights */
        search = next_weight();
        if( search == 0 )
findhash.c - 源码说明

本页面展示了「开放源码的编译器open watcom 1.6.0版的源代码」中的 findhash.c 源码文件，采用 C语言编程语言编写，共 1,153 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫开发者社区收录了大量与Open Watcom相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?