⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 libs-standalone.c

📁 Language, Script, and Encoding Identification with String Kernel Classifiers
💻 C
字号:
/*** Copyright (C) 2006 Thai Computational Linguistics Laboratory (TCL)** National Institute of Information and Communications Technology (NICT)** Canasai Kruengkrai <canasai xx gmail yy com, where xx=at and yy=dot>**** This file is part of the `libs' library.**** This library is free software; you can redistribute it and/or modify** it under the terms of the GNU General Public License as published by** the Free Software Foundation; either version 2 of the License, or** (at your option) any later version.**** This program is distributed in the hope that it will be useful,** but WITHOUT ANY WARRANTY; without even the implied warranty of** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the** GNU General Public License for more details.**** You should have received a copy of the GNU General Public License** along with this program; if not, write to the Free Software** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*/#include <stdio.h>#include <ctype.h>#include <stdlib.h>#include <string.h>#include <unistd.h>#ifdef __cplusplusextern "C" {#include "CKHash.h"#endif#ifdef __cplusplus}#endif#include "libsvm-string-2.71/svm.h"inline void   Swap( int &a, int &b ) { int c = a; a = b; b = c; }#define file_error( msg )   fprintf( stderr, "\nFILE ERROR: Could not open the file: %s\n", msg ), exit( 0 )#define fatal_error( msg )  fprintf( stderr, "\n%s\n", msg ), exit( 0 )#define malloc_error( str1, str2 ) fprintf( stderr, "ERROR: In function `%s': Could not allocate memory for `%s'\n", str1, str2 ), exit( 1 )#define Malloc( type, n ) ( type * )malloc( ( n )*sizeof( type ) )#define MODEL "samples/default/model"#define LABEL "samples/default/label"#define _LOWER_BOUND_LENGTH_ 10#define _UPPER_BOUND_LENGTH_ 100#define _SAMPLING_SIZE_ 10typedef struct {                int LOWER_BOUND_LENGTH;    int UPPER_BOUND_LENGTH;    char *MODEL_FILE;    char *LABEL_FILE;    int SAMPLING_SIZE;} SAParam;SAParam sa_param;void exit_with_help( char *prog_name ){    fprintf( stderr, "\nUsage:"                     "\n------"                     "\n%s [options] target_file"                     "\noptions:"                     "\n  -l lower-bound: set minimum string length per line (default 15)"                     "\n  -u upper-bound: set maximum string length per line (default 100)"                     "\n  -m model_file: set model file (default %s)"                     "\n  -b label_file: set label file (default %s)"                     "\n  -s sampling_size: set number of randomly selected text samples (default 10)"                     "\n",                     prog_name, MODEL, LABEL );    exit( 0 );}void predict( svm_model *model,              char **label_name,              char *target_file ){    int num_tokens   = 0;    int num_lines    = 0;    int longest_line = 0;    text_scan( target_file, &num_tokens, &num_lines, &longest_line );    char *line = Malloc( char, longest_line );    FILE *file_ptr = fopen( target_file, "r" );    char **sample = Malloc( char *, num_lines );    int count = 0;    while( fgets( line, longest_line, file_ptr ) != NULL )    {        if( text_not_blank( line ) )        {            line[ strlen( line ) - 1 ] = '\0';            char *string = text_copy2( line );            int str_length = strlen( string );            if( str_length > sa_param.LOWER_BOUND_LENGTH )            {                if( str_length > sa_param.UPPER_BOUND_LENGTH )                {                    char *new_string;                    if( ( new_string = Malloc( char, sa_param.UPPER_BOUND_LENGTH + 1 ) ) == NULL )                        malloc_error( "predict", "new_string" );                    strncpy( new_string, string, sa_param.UPPER_BOUND_LENGTH );                    new_string[sa_param.UPPER_BOUND_LENGTH] = '\0';                    free( string );                    string = new_string;                }                sample[count++] = string;            }        }    }    fclose( file_ptr );    if( count == 0 )    {        fprintf( stderr, "Unidentified\n" );        exit( 0 );    }    if( count < sa_param.SAMPLING_SIZE )        sa_param.SAMPLING_SIZE = count;    //fprintf( stderr, "sa_param.SAMPLING_SIZE=%d | count=%d\n", sa_param.SAMPLING_SIZE, count );	// random shuffle    int i, j;	int *perm = Malloc( int, count );	for( i = 0 ; i < count; i++ )         perm[i] = i;	for( i = 0; i < count; i++ )	{		j = i + rand() % ( count - i );		Swap( perm[i], perm[j] );	}    Dict *lang_enc = ckh_construct_dict( 16 );    for( j = 0; j < sa_param.SAMPLING_SIZE; j++ )    {        int str_length = strlen( sample[perm[j]] );        struct svm_node *x = Malloc( struct svm_node, str_length+1 );        char *tmp_str = sample[perm[j]];        //fprintf( stderr, "%d %d %s\n", j, perm[j], tmp_str );        for( i = 0; i < str_length; i++ )        {            x[i].index = i;            x[i].value = ( unsigned char )tmp_str[i];        }        x[i++].index = -1;        // predict this sample        int v = ( int )svm_predict( model, x );        fprintf( stderr, "." );        //fprintf( stderr, "%d %s\n", v, label_name[v]  );        if( !ckh_lookup( lang_enc, label_name[v] ) )            ckh_insert( lang_enc, label_name[v], 1 );        else            ckh_increase_value( lang_enc, label_name[v] );        free( x );    }    fprintf( stderr, "*\n" );    int max_score = -1;    char *best_result = NULL;    for( j = 0; j < lang_enc->table_size; j++ )    {        if( lang_enc->T1[j].key != NULL && lang_enc->T1[j].value > max_score )        {            max_score = lang_enc->T1[j].value;            best_result = lang_enc->T1[j].key;        }        if( lang_enc->T2[j].key != NULL && lang_enc->T2[j].value > max_score )        {            max_score = lang_enc->T2[j].value;            best_result = lang_enc->T2[j].key;        }    }    //fprintf( stderr, "%s (%d) \n", best_result, max_score );    printf( "%s\n", best_result );    ckh_destruct_dict( lang_enc );    free( sample );     free( perm ); }void run( int argc, char **argv ){    int i;    sa_param.LOWER_BOUND_LENGTH = _LOWER_BOUND_LENGTH_;    sa_param.UPPER_BOUND_LENGTH = _UPPER_BOUND_LENGTH_;    sa_param.MODEL_FILE = text_copy( MODEL );    sa_param.LABEL_FILE = text_copy( LABEL );    sa_param.SAMPLING_SIZE = _SAMPLING_SIZE_;    // parse options    for( i = 1; i < argc; i++ )    {        if( argv[i][0] != '-')            break;        ++i;        switch( argv[i-1][1] )        {            case 'l':                sa_param.LOWER_BOUND_LENGTH = atoi( argv[i] );                break;            case 'u':                sa_param.UPPER_BOUND_LENGTH = atoi( argv[i] );                break;            case 'm':                sa_param.MODEL_FILE = text_copy( argv[i] );                break;            case 'b':                sa_param.LABEL_FILE = text_copy( argv[i] );                break;            case 's':                sa_param.SAMPLING_SIZE = atoi( argv[i] );                break;            default:                fprintf( stderr, "unknown option\n" );                exit_with_help( argv[0] );        }    }    if( i >= argc )        exit_with_help( argv[0] );     if( access( sa_param.MODEL_FILE, R_OK ) != 0 )        file_error( sa_param.MODEL_FILE );    if( access( sa_param.LABEL_FILE, R_OK ) != 0 )        file_error( sa_param.LABEL_FILE );    if( access( argv[i], R_OK ) != 0 )        file_error( argv[i] );    svm_model *model = svm_load_model( sa_param.MODEL_FILE );    int max_classname_length = 0;    char **label_name = load_label( sa_param.LABEL_FILE, &max_classname_length );    predict( model, label_name, argv[i] );    svm_destroy_model( model );}int main( int argc, char **argv ){    if( argc < 2 )        exit_with_help( argv[0] );    run( argc, argv );    return( 0 );}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -