⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 text.c

📁 Language, Script, and Encoding Identification with String Kernel Classifiers
💻 C
字号:
/*
** Copyright (C) 2006 Thai Computational Linguistics Laboratory (TCL)
** National Institute of Information and Communications Technology (NICT)
** Canasai Kruengkrai <canasai xx gmail yy com, where xx=at and yy=dot>
**
** This file is part of the `libs' library.
**
** This library is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

/*
** Small string utilities: concatenation, copying, splitting, case
** conversion, and coarse size scanning of text files.
**
** Unless stated otherwise, every function that returns `char *' or
** `char **' returns freshly malloc()ed storage that the caller must free().
** Allocation failures print a message to stderr and terminate the process
** with exit status 1.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/*
** "text.h" presumably declares the functions defined below (it is not used
** for anything else in this file).  When the compiler can tell that the
** header is not on the include path it is skipped, so this translation unit
** can also be built on its own, e.g. for unit tests.
*/
#if defined( __has_include )
#  if __has_include( "text.h" )
#    include "text.h"
#  endif
#else
#  include "text.h"
#endif

#define Malloc( type, n ) ( ( type * )malloc( ( n ) * sizeof( type ) ) )
#define malloc_error( str ) ( fprintf( stderr, "ERROR: In function `%s': Could not allocate memory\n", str ), exit( 1 ) )
#define file_error( str1, str2 ) ( fprintf( stderr, "ERROR: In function `%s': Could not open file `%s'\n", str1, str2 ), exit( 1 ) )

/* Padding added to the values reported by text_scan()/text_string_scan(). */
enum
{
    LONGEST_LINE_PADDING = 10,  /* added once to the longest line length   */
    CHARS_PER_LINE_PADDING = 50 /* added per line to the character total   */
};

/* Allocates `count' chars, or reports the failure on behalf of `func_name' and exits. */
static char *alloc_chars( size_t count, const char *func_name )
{
    char *buffer = Malloc( char, count );

    if( buffer == NULL )
        malloc_error( func_name );
    return( buffer );
}

/* Allocates `count' string slots, or reports the failure on behalf of `func_name' and exits. */
static char **alloc_string_array( size_t count, const char *func_name )
{
    char **slots = Malloc( char *, count );

    if( slots == NULL )
        malloc_error( func_name );
    return( slots );
}

/* Returns a NUL-terminated copy of the `len' bytes starting at `src'. */
static char *copy_range( const char *src, size_t len, const char *func_name )
{
    char *result = alloc_chars( len + 1, func_name );

    memcpy( result, src, len );
    result[len] = '\0';
    return( result );
}

/* Returns `head' + the `sep_len' bytes at `sep' + `tail' as one new string. */
static char *join_parts( const char *head, const char *sep, size_t sep_len,
                         const char *tail, const char *func_name )
{
    size_t head_len = strlen( head );
    size_t tail_len = strlen( tail );
    char *result = alloc_chars( head_len + sep_len + tail_len + 1, func_name );

    memcpy( result, head, head_len );
    memcpy( result + head_len, sep, sep_len );
    memcpy( result + head_len + sep_len, tail, tail_len + 1 ); /* includes the NUL */
    return( result );
}

/*
** Splits `str' into tokens separated by runs of any character in `delims'
** (the same tokenisation strtok() performs, but without touching `str' or
** any static state).  Returns a NULL-terminated array of new strings; the
** array holds only the NULL terminator when `str' contains no token.
*/
static char **split_on_any( const char *str, const char *delims, const char *func_name )
{
    size_t count = 0;
    size_t filled = 0;
    const char *cursor;
    char **tokens;

    /* First pass: count the tokens so the array can be sized exactly. */
    for( cursor = str + strspn( str, delims ); *cursor != '\0'; cursor += strspn( cursor, delims ) )
    {
        cursor += strcspn( cursor, delims );
        ++count;
    }

    tokens = alloc_string_array( count + 1, func_name );

    /* Second pass: copy each token out. */
    for( cursor = str + strspn( str, delims ); *cursor != '\0'; cursor += strspn( cursor, delims ) )
    {
        size_t len = strcspn( cursor, delims );

        tokens[filled++] = copy_range( cursor, len, func_name );
        cursor += len;
    }
    tokens[filled] = NULL;
    return( tokens );
}

/*
** Shared implementation of text_scan() and text_string_scan().
**
** Reads `file_name' byte by byte and reports:
**   *num_tokens   - 2 for every whitespace byte plus 2 more for every newline
**   *num_lines    - number of newline bytes
**   *longest_line - length of the longest line (newline excluded) + LONGEST_LINE_PADDING
**   *num_chars    - sum over all lines of (line length + CHARS_PER_LINE_PADDING);
**                   skipped when `num_chars' is NULL
**
** Exits with status 1 if the file cannot be opened or if its last line is
** not terminated by a newline; `func_name' is the name used in the message.
*/
static void scan_file( const char *func_name, char *file_name, int *num_tokens,
                       int *num_lines, int *longest_line, int *num_chars )
{
    FILE *file_ptr = fopen( file_name, "r" );
    int line_len = 0;
    int longest = 0;
    int token_count = 0;
    int line_count = 0;
    int char_count = 0;
    int ic;

    if( file_ptr == NULL )
        file_error( func_name, file_name );

    /* getc() yields an unsigned char value (or EOF), which is exactly what
       the <ctype.h> functions require, so `ic' is used without narrowing. */
    while( ( ic = getc( file_ptr ) ) != EOF )
    {
        if( ic == '\n' )
        {
            char_count += line_len + CHARS_PER_LINE_PADDING;
            line_count++;
            token_count += 2;    // add two tokens for newline
            if( line_len > longest )
                longest = line_len;
            line_len = 0;
        }
        else
        {
            line_len++;
        }
        if( isspace( ic ) )
            token_count += 2;    // for a text segment and its space
    }
    fclose( file_ptr );

    // A non-empty final line means the file does not end with a newline.
    if( line_len != 0 )
    {
        fprintf( stderr, "\nERROR: In function `%s': No newline at the end of file `%s'\n\n", func_name, file_name );
        exit( 1 );
    }

    (*longest_line) = longest + LONGEST_LINE_PADDING;
    (*num_tokens) = token_count;
    (*num_lines) = line_count;
    if( num_chars != NULL )
        (*num_chars) = char_count;
}

/* Returns a new string "<w1> <w2>" (a single space between the two). */
char *text_append_with_space( char *w1, char *w2 )
{
    return( join_parts( w1, " ", 1, w2, "text_append_with_space" ) );
}

/* Returns a new string made of `w1', the character `w3', then `w2'. */
char *text_append_with_char( char *w1, char *w2, char w3 )
{
    return( join_parts( w1, &w3, 1, w2, "text_append_with_char" ) );
}

/* Returns a new string holding `w1' immediately followed by `w2'. */
char *text_append( char *w1, char *w2 )
{
    return( join_parts( w1, "", 0, w2, "text_append" ) );
}

/*
** Returns a new string holding str[start] .. str[end - 1] (`end' is
** exclusive; start == end gives an empty string).
**
** Exits with status 1 when end < start.  `start' and `end' are not checked
** against the length of `str'; the caller must keep them in range.
*/
char *text_sub_string( char *str, int start, int end )
{
    int str_len = end - start + 1;

    if( str_len <= 0 )
    {
        fprintf( stderr, "ERROR: In function `text_sub_string': str_len = %d\n", str_len );
        exit( 1 );
    }
    return( copy_range( str + start, ( size_t )( end - start ), "text_sub_string" ) );
}

/* Like text_sub_string(), but also free()s `old_string' before returning. */
char *text_sub_string_itself( char *old_string, int start, int end )
{
    char *new_string = text_sub_string( old_string, start, end );

    free( old_string );
    return( new_string );
}

/* Returns `prefix' + `str_itself' as a new string and free()s `str_itself'. */
char *text_append_itself_with_prefix( char *prefix, char *str_itself )
{
    char *new_str_itself = text_append( prefix, str_itself );

    free( str_itself );
    return( new_str_itself );
}

/* Returns `str_itself' + `suffix' as a new string and free()s `str_itself'. */
char *text_append_itself_with_suffix( char *str_itself, char *suffix )
{
    char *new_str_itself = text_append( str_itself, suffix );

    free( str_itself );
    return( new_str_itself );
}

/* Returns a new copy of `str'. */
char *text_copy( char *str )
{
    return( copy_range( str, strlen( str ), "text_copy" ) );
}

/* Returns a new copy of `str' with leading spaces and tabs removed. */
char *text_copy2( char *str )
{
    while( *str == ' ' || *str == '\t' )
        ++str;
    return( copy_range( str, strlen( str ), "text_copy2" ) );
}

/*
** Returns a new copy of `str' cut at the first control or whitespace
** character.  The buffer is still sized for the whole of `str'.
*/
char *text_copy3( char *str )
{
    char *result = copy_range( str, strlen( str ), "text_copy3" );
    char *p;

    for( p = result; *p != '\0'; p++ )
    {
        /* <ctype.h> functions need an unsigned char value, not a plain char. */
        unsigned char ch = ( unsigned char )*p;

        if( iscntrl( ch ) || isspace( ch ) )
        {
            *p = '\0';
            break;
        }
    }
    return( result );
}

/*
** Returns a new copy of `str' cut at the first control character other than
** a tab.  The buffer is still sized for the whole of `str'.
*/
char *text_copy4( char *str )
{
    char *result = copy_range( str, strlen( str ), "text_copy4" );
    char *p;

    for( p = result; *p != '\0'; p++ )
    {
        if( iscntrl( ( unsigned char )*p ) && *p != '\t' )
        {
            *p = '\0';
            break;
        }
    }
    return( result );
}

/* Returns 1 if `str' contains at least one non-whitespace character, else 0. */
int text_not_blank( char *str )
{
    const char *p;

    for( p = str; *p != '\0'; ++p )
    {
        if( !isspace( ( unsigned char )*p ) )
            return( 1 );
    }
    return( 0 );
}

/*
** Returns the number of whitespace runs that separate two words in `str';
** leading and trailing whitespace is not counted.
**
** NOTE(review): despite the name, this is the word count minus one for any
** string that contains a word ("a b c" gives 2, "a" gives 0).  The result
** is kept as it was because callers may depend on it - confirm before
** changing.
*/
int text_num_words( char *str )
{
    int pos = 0;
    int gaps = 0;

    while( str[pos] != '\0' && isspace( ( unsigned char )str[pos] ) )
        ++pos;

    while( str[pos++] != '\0' )
    {
        if( isspace( ( unsigned char )str[pos - 1] ) )
        {
            ++gaps;
            while( isspace( ( unsigned char )str[pos] ) )
                ++pos;
            if( str[pos] == '\0' )
                --gaps;    /* the run was trailing whitespace, not a separator */
        }
    }
    return( gaps );
}

/* Returns the number of whitespace characters in `str'. */
int text_num_spaces( char *str )
{
    int spaces = 0;
    const char *p;

    for( p = str; *p != '\0'; ++p )
    {
        if( isspace( ( unsigned char )*p ) )
            ++spaces;
    }
    return( spaces );
}

/*
** Splits `str' on runs of spaces and tabs.  `str' itself is not modified.
**
** Returns a NULL-terminated array of new strings; the caller frees every
** element and then the array.  An empty or all-blank `str' gives an array
** that holds only the NULL terminator.
*/
char **text_split_independent( char *str )
{
    return( split_on_any( str, "\t ", "text_split_independent" ) );
}

/*
** Splits `str' on runs of any of the characters in `delimiter' (each
** character is a separator on its own; empty tokens are never produced).
** `str' itself is not modified.
**
** Returns a NULL-terminated array of new strings; the caller frees every
** element and then the array.  A `str' without any token gives an array
** that holds only the NULL terminator.
*/
char **text_split_with_delimiter( char *str, char *delimiter )
{
    return( split_on_any( str, delimiter, "text_split_with_delimiter" ) );
}

/*
** Splits `str' at every non-overlapping occurrence of the whole string
** `delimiter', scanning left to right.  Empty pieces are kept, so
** "a::b::" exploded on "::" gives "a", "b", "".  `str' is not modified.
**
** Returns a NULL-terminated array of new strings; the caller frees every
** element and then the array.  An empty `delimiter' can never split
** anything, so the array then holds a single copy of `str'.
*/
char **text_explode( char *str, char *delimiter )
{
    size_t delim_len = strlen( delimiter );
    size_t pieces = 1;    /* the part after the last delimiter */
    size_t filled = 0;
    const char *cursor;
    const char *hit;
    char **result;

    /* strstr() matches an empty needle at every position without ever
       advancing, so searching is only meaningful for a non-empty delimiter. */
    if( delim_len > 0 )
    {
        for( cursor = str; ( hit = strstr( cursor, delimiter ) ) != NULL; cursor = hit + delim_len )
            ++pieces;
    }

    result = alloc_string_array( pieces + 1, "text_explode" );

    cursor = str;
    if( delim_len > 0 )
    {
        while( ( hit = strstr( cursor, delimiter ) ) != NULL )
        {
            result[filled++] = copy_range( cursor, ( size_t )( hit - cursor ), "text_explode" );
            cursor = hit + delim_len;
        }
    }
    result[filled++] = copy_range( cursor, strlen( cursor ), "text_explode" );
    result[filled] = NULL;
    return( result );
}

/* Returns a new copy of `str' with every upper-case letter made lower-case. */
char *text_make_lower( char *str )
{
    char *result = copy_range( str, strlen( str ), "text_make_lower" );
    char *p;

    for( p = result; *p != '\0'; p++ )
        *p = ( char )tolower( ( unsigned char )*p );
    return( result );
}

/* Returns a new copy of `str' with every lower-case letter made upper-case. */
char *text_make_upper( char *str )
{
    char *result = copy_range( str, strlen( str ), "text_make_upper" );
    char *p;

    for( p = result; *p != '\0'; p++ )
        *p = ( char )toupper( ( unsigned char )*p );
    return( result );
}

/*
** Scans the text file `file_name' and stores rough size figures in
** *num_tokens, *num_lines and *longest_line; see scan_file() for how each
** figure is computed.
**
** Exits with status 1 if the file cannot be opened or does not end with a
** newline.
*/
void text_scan( char *file_name, int *num_tokens, int *num_lines, int *longest_line )
{
    scan_file( "text_scan", file_name, num_tokens, num_lines, longest_line, NULL );
}

/* Returns 1 if `any_string' contains at least one alphabetic character, else 0. */
int text_found_char_in( char *any_string )
{
    const unsigned char *p;

    for( p = ( const unsigned char * )any_string; *p != '\0'; p++ )
    {
        if( isalpha( *p ) )
            return( 1 );
    }
    return( 0 );
}

/*
** Returns 1 if every character of `any_string' is a decimal digit, else 0.
** An empty string has no non-digit character and therefore also gives 1.
*/
int text_is_digit( char *any_string )
{
    const unsigned char *p;

    for( p = ( const unsigned char * )any_string; *p != '\0'; p++ )
    {
        if( !isdigit( *p ) )
            return( 0 );
    }
    return( 1 );
}

/*
** Same as text_scan(), and additionally stores in *num_chars the sum over
** all lines of (line length + CHARS_PER_LINE_PADDING).
**
** Exits with status 1 if the file cannot be opened or does not end with a
** newline.
*/
void text_string_scan( char *file_name, int *num_tokens, int *num_lines, int *longest_line, int *num_chars )
{
    /* scan_file() treats a NULL num_chars as "not wanted"; this function
       has always written through the pointer, so a local stands in to keep
       a NULL argument from being silently accepted here. */
    int char_count = 0;

    scan_file( "text_string_scan", file_name, num_tokens, num_lines, longest_line, &char_count );
    (*num_chars) = char_count;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -