⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 libs-mktdt.c

📁 Language, Script, and Encoding Identification with String Kernel Classifiers
💻 C
字号:
/*** Copyright (C) 2006 Thai Computational Linguistics Laboratory (TCL)** National Institute of Information and Communications Technology (NICT)** Canasai Kruengkrai <canasai xx gmail yy com, where xx=at and yy=dot>**** This file is part of the `libs' library.**** This library is free software; you can redistribute it and/or modify** it under the terms of the GNU General Public License as published by** the Free Software Foundation; either version 2 of the License, or** (at your option) any later version.**** This program is distributed in the hope that it will be useful,** but WITHOUT ANY WARRANTY; without even the implied warranty of** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the** GNU General Public License for more details.**** You should have received a copy of the GNU General Public License** along with this program; if not, write to the Free Software** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*/#include <string.h>#include <ctype.h>#include <math.h>#include <errno.h>#include <dirent.h>#include <stdio.h>#include <stdlib.h>#include <time.h>#include <iconv.h>#ifdef __cplusplusextern "C" {#include "text.h"#endif#ifdef __cplusplus}#endif#define file_error( msg )   fprintf( stderr, "\nFILE ERROR: Could not open the file: %s\n", msg ), exit( 0 )#define fatal_error( msg )  fprintf( stderr, "\n%s\n", msg ), exit( 0 )#define malloc_error( str1, str2 ) fprintf( stderr, "ERROR: In function `%s': Could not allocate memory for `%s'\n", str1, str2 ), exit( 1 )#define Malloc( type, n ) ( type * )malloc( ( n ) * sizeof( type ) )#define MAX_STRING_LEN 1024#define FOTMATTED_DIR "fm"#define TAGGED_FILE "tagged"#define LABEL_FILE "label"typedef struct {    char *resource_directory;    int num_samples_per_class;    int sample_length;    char *tagged_file;    char *label_file;} TDTParam;TDTParam *read_config_file( char *conf_file ){    int num_tokens   = 0;    int num_lines    = 0;    int longest_line = 0;    text_scan( conf_file, &num_tokens, &num_lines, &longest_line );    char *line;    if( ( line = Malloc( char, longest_line ) ) == NULL )        malloc_error( "read_config_file", "line" );    TDTParam *param = Malloc( TDTParam, 1 );    char name[MAX_STRING_LEN];    char value[MAX_STRING_LEN];    FILE *file_ptr = fopen( conf_file, "r" );    while( fgets( line, longest_line, file_ptr ) != NULL )    {        if( text_not_blank( line ) && line[0] != '#' )        {            if( line[ strlen( line ) - 1 ] == '\n' )                line[ strlen( line ) - 1 ] = '\0';            sscanf( line, "%[^=]=%[^=]", name, value );            if( strcmp( name, "resource_directory" ) == 0 )                param->resource_directory = text_copy( value );            else if( strcmp( name, "num_samples_per_class" ) == 0 )                param->num_samples_per_class = atoi( value );            else if( strcmp( name, "sample_length" ) == 0 )                param->sample_length = atoi( value );        }    }    fclose( file_ptr );    free( line );    return( param );}void format_line( TDTParam *param, char *file_name ){    char *raw_file = text_append_with_char( param->resource_directory, file_name, '/' );    char *formatted_file = text_append_with_char( text_append_with_char( param->resource_directory, FOTMATTED_DIR, '/' ), file_name, '/' );    int num_tokens   = 0;    int num_lines    = 0;    int longest_line = 0;    text_scan( raw_file, &num_tokens, &num_lines, &longest_line );    char *line;    if( ( line = Malloc( char, longest_line ) ) == NULL )        malloc_error( "format_line", "line" );    FILE *file_ptr = fopen( raw_file, "r" );    FILE *file_ptr2 = fopen( formatted_file, "w" );    int fixed_length = param->sample_length;    while( fgets( line, longest_line, file_ptr ) != NULL )    {        // 1. check blank line and length        int line_length = strlen( line );        if( text_not_blank( line ) &&  line_length > fixed_length )        {            if( line[ strlen( line ) - 1 ] == '\n' )                line[ strlen( line ) - 1 ] = '\0';            //printf( "%s\n", line );            // 2. make formatted line with fixed line length            char *tmp_str = text_copy2( line );             int start = 0;            int end = fixed_length;            while( end < line_length )            {                int i, j;                char *sub_str = Malloc( char, ( end - start + 1 ) );                for( i = 0, j = start; j < end; i++, j++ )                {                    sub_str[i] = tmp_str[j];                }                sub_str[i] = '\0';                //printf( "[%d-%d]: %s\n", start, end, sub_str );                fprintf( file_ptr2, "%s\n", sub_str );                free( sub_str );                start = end;                end = start + fixed_length;            }            free( tmp_str );        }    }    //printf( "\n" );    fclose( file_ptr );    fclose( file_ptr2 );    free( line );    free( raw_file );}void make_formatted_data( TDTParam *param ){    struct dirent *pent;    DIR *pdir = opendir( param->resource_directory );    if( !pdir )    {        fprintf( stderr, "ERROR: opendir(%s) failure.\n", param->resource_directory );        exit( 1 );    }    errno = 0;    while( ( pent = readdir( pdir ) ) )    {        if( strstr( pent->d_name, ".txt" ) != NULL )        {            format_line( param, pent->d_name );        }    }    if( errno )    {        fprintf( stderr, "ERROR: readdir() failure %d.\n", errno );        exit( 1 );    }    closedir( pdir );}void append_samples( TDTParam *param, char *file_name, int class_index ){    char *formatted_file = text_append_with_char( text_append_with_char( param->resource_directory, FOTMATTED_DIR, '/' ), file_name, '/' );    //printf( "%s\n", formatted_file );    int num_tokens   = 0;    int num_lines    = 0;    int longest_line = 0;    text_scan( formatted_file, &num_tokens, &num_lines, &longest_line );    char *line;    if( ( line = Malloc( char, longest_line ) ) == NULL )        malloc_error( "append_samples", "line" );    int *line_index;    if( ( line_index = Malloc( int, num_lines ) ) == NULL )        malloc_error( "append_samples", "line_index" );    char **line_set;    if( ( line_set = Malloc( char *, num_lines ) ) == NULL )        malloc_error( "append_samples", "line_set" );    // 1. Read all lines    FILE *file_ptr = fopen( formatted_file, "r" );    int n = 0;    while( fgets( line, longest_line, file_ptr ) != NULL )    {        if( text_not_blank( line ) )        {            if( line[ strlen( line ) - 1 ] == '\n' )                line[ strlen( line ) - 1 ] = '\0';            line_set[n] = text_copy( line );            line_index[n] = n;            n++;        }    }    fclose( file_ptr );    num_lines = n;    // 2. Random shuffle line index    int i, j, k;    for( i = 0; i < num_lines; i++ )    {        j = i+rand()%(num_lines-i);        k = line_index[i];        line_index[i] = line_index[j];        line_index[j] = k;        //printf( "%d %d\n", line_index[i], line_index[j] );    }    // Print formatted samples    file_ptr = fopen( param->tagged_file, "a" );    j = 0;    for( i = 0; i < num_lines && i < param->num_samples_per_class; i++ )    {        fprintf( file_ptr, "%d %s\n", class_index, line_set[line_index[i]] );        j++;    }    fclose( file_ptr );    fprintf( stderr, "\n>  `%s', class[%d], #samples=%d", file_name, class_index, j );    file_ptr = fopen( param->label_file, "a" );    fprintf( file_ptr, "%d", class_index );    char **split_ptr = text_explode( file_name, "." );    char **split_ptr2 = split_ptr;    while( *split_ptr != NULL && strcmp( *split_ptr, "txt" ) != 0 )    {        fprintf( file_ptr, " %s", *split_ptr );        free( *split_ptr );        ++split_ptr;    }    free( split_ptr2 );    fprintf( file_ptr, "\n" );    fclose( file_ptr );    free( line_index );    free( line_set );    free( line );}void make_tagged_data( TDTParam *param ){    char *formatted_dir = text_append_with_char( param->resource_directory, FOTMATTED_DIR, '/' );    param->tagged_file = text_append_with_char( param->resource_directory, TAGGED_FILE, '/' );    param->label_file = text_append_with_char( param->resource_directory, LABEL_FILE, '/' );    fprintf( stderr, "\n> resource_directory = [%s]", param->resource_directory );    fprintf( stderr, "\n> write generated tagged file = [%s]", param->tagged_file );    fprintf( stderr, "\n> write generated label file = [%s]", param->label_file );    fprintf( stderr, "\n> sample_length = [%d]", param->sample_length );    fprintf( stderr, "\n> expected num_samples_per_class = [%d]", param->num_samples_per_class );    // clear old info    FILE *file_ptr;    file_ptr = fopen( param->tagged_file, "w" );    fclose( file_ptr );    file_ptr = fopen( param->label_file, "w" );    fclose( file_ptr );    struct dirent *pent;    DIR *pdir = opendir( formatted_dir );    if( !pdir )    {        fprintf( stderr, "ERROR: opendir(%s) failure.\n", formatted_dir );        exit( 1 );    }    errno = 0;    int class_index = 0;    while( ( pent = readdir( pdir ) ) )    {        if( strstr( pent->d_name, ".txt" ) != NULL )        {//printf( "%s\n", pent->d_name );            append_samples( param, pent->d_name, class_index );            class_index++;        }    }    if( errno )    {        fprintf( stderr, "ERROR: readdir() failure %d.\n", errno );        exit( 1 );    }    closedir( pdir );}void exit_with_help( char *prog_name ){    fprintf( stderr, "\nUsage:"                     "\n------"                     "\n%s config_file\n\n", prog_name );    exit( 0 );}int main( int argc, char **argv ){        char cmd[256];    if( argc < 2 )        exit_with_help( argv[0] );    fprintf( stderr, "> read config file..." );    TDTParam *param = read_config_file( argv[1] );    fprintf( stderr, "done\n" );    fprintf( stderr, "> make temporary formatted data..." );    sprintf( cmd, "mkdir %s/%s", param->resource_directory, FOTMATTED_DIR );    system( cmd );    make_formatted_data( param );    fprintf( stderr, "done\n" );    fprintf( stderr, "> make tagged data..." );    srand( time( NULL ) );    make_tagged_data( param );    fprintf( stderr, "\n> done\n" );    sprintf( cmd, "rm -rf %s/%s", param->resource_directory, FOTMATTED_DIR );    system( cmd );    free( param );    return( 0 );}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -