📄 libs-mktdt.c
字号:
/*** Copyright (C) 2006 Thai Computational Linguistics Laboratory (TCL)** National Institute of Information and Communications Technology (NICT)** Canasai Kruengkrai <canasai xx gmail yy com, where xx=at and yy=dot>**** This file is part of the `libs' library.**** This library is free software; you can redistribute it and/or modify** it under the terms of the GNU General Public License as published by** the Free Software Foundation; either version 2 of the License, or** (at your option) any later version.**** This program is distributed in the hope that it will be useful,** but WITHOUT ANY WARRANTY; without even the implied warranty of** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the** GNU General Public License for more details.**** You should have received a copy of the GNU General Public License** along with this program; if not, write to the Free Software** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*/#include <string.h>#include <ctype.h>#include <math.h>#include <errno.h>#include <dirent.h>#include <stdio.h>#include <stdlib.h>#include <time.h>#include <iconv.h>#ifdef __cplusplusextern "C" {#include "text.h"#endif#ifdef __cplusplus}#endif#define file_error( msg ) fprintf( stderr, "\nFILE ERROR: Could not open the file: %s\n", msg ), exit( 0 )#define fatal_error( msg ) fprintf( stderr, "\n%s\n", msg ), exit( 0 )#define malloc_error( str1, str2 ) fprintf( stderr, "ERROR: In function `%s': Could not allocate memory for `%s'\n", str1, str2 ), exit( 1 )#define Malloc( type, n ) ( type * )malloc( ( n ) * sizeof( type ) )#define MAX_STRING_LEN 1024#define FOTMATTED_DIR "fm"#define TAGGED_FILE "tagged"#define LABEL_FILE "label"typedef struct { char *resource_directory; int num_samples_per_class; int sample_length; char *tagged_file; char *label_file;} TDTParam;TDTParam *read_config_file( char *conf_file ){ int num_tokens = 0; int num_lines = 0; int longest_line = 0; text_scan( conf_file, &num_tokens, &num_lines, &longest_line ); char *line; if( ( line = Malloc( char, longest_line ) ) == NULL ) malloc_error( "read_config_file", "line" ); TDTParam *param = Malloc( TDTParam, 1 ); char name[MAX_STRING_LEN]; char value[MAX_STRING_LEN]; FILE *file_ptr = fopen( conf_file, "r" ); while( fgets( line, longest_line, file_ptr ) != NULL ) { if( text_not_blank( line ) && line[0] != '#' ) { if( line[ strlen( line ) - 1 ] == '\n' ) line[ strlen( line ) - 1 ] = '\0'; sscanf( line, "%[^=]=%[^=]", name, value ); if( strcmp( name, "resource_directory" ) == 0 ) param->resource_directory = text_copy( value ); else if( strcmp( name, "num_samples_per_class" ) == 0 ) param->num_samples_per_class = atoi( value ); else if( strcmp( name, "sample_length" ) == 0 ) param->sample_length = atoi( value ); } } fclose( file_ptr ); free( line ); return( param );}void format_line( TDTParam *param, char *file_name ){ char *raw_file = text_append_with_char( param->resource_directory, file_name, '/' ); char *formatted_file = text_append_with_char( text_append_with_char( param->resource_directory, FOTMATTED_DIR, '/' ), file_name, '/' ); int num_tokens = 0; int num_lines = 0; int longest_line = 0; text_scan( raw_file, &num_tokens, &num_lines, &longest_line ); char *line; if( ( line = Malloc( char, longest_line ) ) == NULL ) malloc_error( "format_line", "line" ); FILE *file_ptr = fopen( raw_file, "r" ); FILE *file_ptr2 = fopen( formatted_file, "w" ); int fixed_length = param->sample_length; while( fgets( line, longest_line, file_ptr ) != NULL ) { // 1. check blank line and length int line_length = strlen( line ); if( text_not_blank( line ) && line_length > fixed_length ) { if( line[ strlen( line ) - 1 ] == '\n' ) line[ strlen( line ) - 1 ] = '\0'; //printf( "%s\n", line ); // 2. make formatted line with fixed line length char *tmp_str = text_copy2( line ); int start = 0; int end = fixed_length; while( end < line_length ) { int i, j; char *sub_str = Malloc( char, ( end - start + 1 ) ); for( i = 0, j = start; j < end; i++, j++ ) { sub_str[i] = tmp_str[j]; } sub_str[i] = '\0'; //printf( "[%d-%d]: %s\n", start, end, sub_str ); fprintf( file_ptr2, "%s\n", sub_str ); free( sub_str ); start = end; end = start + fixed_length; } free( tmp_str ); } } //printf( "\n" ); fclose( file_ptr ); fclose( file_ptr2 ); free( line ); free( raw_file );}void make_formatted_data( TDTParam *param ){ struct dirent *pent; DIR *pdir = opendir( param->resource_directory ); if( !pdir ) { fprintf( stderr, "ERROR: opendir(%s) failure.\n", param->resource_directory ); exit( 1 ); } errno = 0; while( ( pent = readdir( pdir ) ) ) { if( strstr( pent->d_name, ".txt" ) != NULL ) { format_line( param, pent->d_name ); } } if( errno ) { fprintf( stderr, "ERROR: readdir() failure %d.\n", errno ); exit( 1 ); } closedir( pdir );}void append_samples( TDTParam *param, char *file_name, int class_index ){ char *formatted_file = text_append_with_char( text_append_with_char( param->resource_directory, FOTMATTED_DIR, '/' ), file_name, '/' ); //printf( "%s\n", formatted_file ); int num_tokens = 0; int num_lines = 0; int longest_line = 0; text_scan( formatted_file, &num_tokens, &num_lines, &longest_line ); char *line; if( ( line = Malloc( char, longest_line ) ) == NULL ) malloc_error( "append_samples", "line" ); int *line_index; if( ( line_index = Malloc( int, num_lines ) ) == NULL ) malloc_error( "append_samples", "line_index" ); char **line_set; if( ( line_set = Malloc( char *, num_lines ) ) == NULL ) malloc_error( "append_samples", "line_set" ); // 1. Read all lines FILE *file_ptr = fopen( formatted_file, "r" ); int n = 0; while( fgets( line, longest_line, file_ptr ) != NULL ) { if( text_not_blank( line ) ) { if( line[ strlen( line ) - 1 ] == '\n' ) line[ strlen( line ) - 1 ] = '\0'; line_set[n] = text_copy( line ); line_index[n] = n; n++; } } fclose( file_ptr ); num_lines = n; // 2. Random shuffle line index int i, j, k; for( i = 0; i < num_lines; i++ ) { j = i+rand()%(num_lines-i); k = line_index[i]; line_index[i] = line_index[j]; line_index[j] = k; //printf( "%d %d\n", line_index[i], line_index[j] ); } // Print formatted samples file_ptr = fopen( param->tagged_file, "a" ); j = 0; for( i = 0; i < num_lines && i < param->num_samples_per_class; i++ ) { fprintf( file_ptr, "%d %s\n", class_index, line_set[line_index[i]] ); j++; } fclose( file_ptr ); fprintf( stderr, "\n> `%s', class[%d], #samples=%d", file_name, class_index, j ); file_ptr = fopen( param->label_file, "a" ); fprintf( file_ptr, "%d", class_index ); char **split_ptr = text_explode( file_name, "." ); char **split_ptr2 = split_ptr; while( *split_ptr != NULL && strcmp( *split_ptr, "txt" ) != 0 ) { fprintf( file_ptr, " %s", *split_ptr ); free( *split_ptr ); ++split_ptr; } free( split_ptr2 ); fprintf( file_ptr, "\n" ); fclose( file_ptr ); free( line_index ); free( line_set ); free( line );}void make_tagged_data( TDTParam *param ){ char *formatted_dir = text_append_with_char( param->resource_directory, FOTMATTED_DIR, '/' ); param->tagged_file = text_append_with_char( param->resource_directory, TAGGED_FILE, '/' ); param->label_file = text_append_with_char( param->resource_directory, LABEL_FILE, '/' ); fprintf( stderr, "\n> resource_directory = [%s]", param->resource_directory ); fprintf( stderr, "\n> write generated tagged file = [%s]", param->tagged_file ); fprintf( stderr, "\n> write generated label file = [%s]", param->label_file ); fprintf( stderr, "\n> sample_length = [%d]", param->sample_length ); fprintf( stderr, "\n> expected num_samples_per_class = [%d]", param->num_samples_per_class ); // clear old info FILE *file_ptr; file_ptr = fopen( param->tagged_file, "w" ); fclose( file_ptr ); file_ptr = fopen( param->label_file, "w" ); fclose( file_ptr ); struct dirent *pent; DIR *pdir = opendir( formatted_dir ); if( !pdir ) { fprintf( stderr, "ERROR: opendir(%s) failure.\n", formatted_dir ); exit( 1 ); } errno = 0; int class_index = 0; while( ( pent = readdir( pdir ) ) ) { if( strstr( pent->d_name, ".txt" ) != NULL ) {//printf( "%s\n", pent->d_name ); append_samples( param, pent->d_name, class_index ); class_index++; } } if( errno ) { fprintf( stderr, "ERROR: readdir() failure %d.\n", errno ); exit( 1 ); } closedir( pdir );}void exit_with_help( char *prog_name ){ fprintf( stderr, "\nUsage:" "\n------" "\n%s config_file\n\n", prog_name ); exit( 0 );}int main( int argc, char **argv ){ char cmd[256]; if( argc < 2 ) exit_with_help( argv[0] ); fprintf( stderr, "> read config file..." ); TDTParam *param = read_config_file( argv[1] ); fprintf( stderr, "done\n" ); fprintf( stderr, "> make temporary formatted data..." ); sprintf( cmd, "mkdir %s/%s", param->resource_directory, FOTMATTED_DIR ); system( cmd ); make_formatted_data( param ); fprintf( stderr, "done\n" ); fprintf( stderr, "> make tagged data..." ); srand( time( NULL ) ); make_tagged_data( param ); fprintf( stderr, "\n> done\n" ); sprintf( cmd, "rm -rf %s/%s", param->resource_directory, FOTMATTED_DIR ); system( cmd ); free( param ); return( 0 );}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -