⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 string-kernels.c

📁 Language, Script, and Encoding Identification with String Kernel Classifiers
💻 C
字号:
/*** Copyright (C) 2006 Thai Computational Linguistics Laboratory (TCL)** National Institute of Information and Communications Technology (NICT)** Canasai Kruengkrai <canasai xx gmail yy com, where xx=at and yy=dot>**** This file is part of the `libs' library.**** This library is free software; you can redistribute it and/or modify** it under the terms of the GNU General Public License as published by** the Free Software Foundation; either version 2 of the License, or** (at your option) any later version.**** This program is distributed in the hope that it will be useful,** but WITHOUT ANY WARRANTY; without even the implied warranty of** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the** GNU General Public License for more details.**** You should have received a copy of the GNU General Public License** along with this program; if not, write to the Free Software** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*/#include <math.h>#include <stdio.h>#include <stdlib.h>#include <ctype.h>#include <float.h>#include <string.h>#include <stdarg.h>#include "../svm/libsvm-string-2.71/svm.h"#include "suffix_tree.h"/* *  Compute the substring kernel between px and py using brute-force matching, *  where substrings up to length n are considered */double brute_force_full_substring( const struct svm_node *px, const struct svm_node *py, int n, double lambda ){    int k;    double ret = 0.0, tmp;    const struct svm_node *t1, *t2, *t3, *t4;    t1 = px;    while( t1->index != -1 )    {        t2 = py;        while( t2->index != -1 )        {            if( t1->value == t2->value )            {                t3 = t1;                t4 = t2;                tmp = lambda * lambda;                k = 0;                while( ( k < n ) &&                       ( t3->index != -1 ) &&                       ( t4->index != -1 ) &&                       ( t3->value == t4->value ) )                {                    ret += tmp;                    tmp *= ( lambda * lambda );                    ++k;                    ++t3;                    ++t4;                }            }            ++t2;        }        ++t1;    }    return( ret );}/* * Compute the substring kernel between px and py using brute-force matching, * where substrings of EXACTLY length n are considered */double brute_force_substring( const struct svm_node *px, const struct svm_node *py, int n, double lambda ){    int k;    double ret = 0.0, tmp;    const struct svm_node *t1, *t2, *t3, *t4;    t1 = px;    while( t1->index != -1 )     {        t2 = py;        while( t2->index != -1 )         {            if( t1->value == t2->value )            {                 t3 = t1;                t4 = t2;                tmp = lambda * lambda;                k = 0;                while( ( k < n ) &&                       ( t3->index != -1 ) &&                       ( t4->index != -1 ) &&                       ( t3->value == t4->value ) )                {                    tmp *= ( lambda * lambda );                    ++k;                    ++t3;                                           ++t4;                }                if( k == n )                   ret += tmp;            }            ++t2;        }        ++t1;    }    return( ret );}char *svm_node2string( const struct svm_node *x ){    const struct svm_node *tp = x;    int str_len = 0, i;    while( tp->index != -1 )    {        ++str_len;        ++tp;    }    char *str = ( char * )malloc( ( str_len + 1 ) * sizeof ( char ) );    for( i = 0; i < str_len; i++ )    {        str[i] = ( unsigned char )x->value;        ++x;    }    str[i++] = '\0';    return( str );}void search_occurrences( NODE* node, int *count ){    NODE* child = node->sons;    while( child != 0 )    {        if( !( child->path_position == child->father->path_position ) )            (*count)++;        if( child->sons != 0 )            search_occurrences( child, count );        child = child->right_sibling;    }    return;}/* * Compute the substring kernel between px and py using the suffix tree, * where substrings up to length n are considered */double suffix_tree_full_substring( const struct svm_node *px, const struct svm_node *py, int n, double lambda ){    const struct svm_node *t1 = px;    const struct svm_node *t2 = py;    char *str = svm_node2string( t2 );    // build suffix tree    SUFFIX_TREE *tree = ST_CreateTree( ( unsigned char * )str, ( DBL_WORD )strlen( str ) );    // search all possible substrings    double ret = 0.0, tmp;    int count = 0;    const struct svm_node *t3;    while( t1->index != -1 )    {        // start with the root's son        NODE *node = find_son( tree, tree->root, ( unsigned char )t1->value );        DBL_WORD str_len = 0;        t3 = t1;        tmp = ( lambda * lambda );        while( node != 0 )        {            DBL_WORD edge_label_start = node->edge_label_start;            DBL_WORD node_label_end = get_node_label_end( tree, node );            // scan the current edge            while( str_len < ( DBL_WORD )n &&                  t3->index != -1 &&                  edge_label_start <= node_label_end &&                  tree->tree_string[edge_label_start] == ( unsigned char )t3->value )            {                // compute the kernel score                count = 0;                search_occurrences( node, &count );                count++;                   ret += ( count * tmp );                // update score                tmp *= ( lambda * lambda );                ++t3;                ++str_len;                ++edge_label_start;            }            if( ( edge_label_start > node_label_end ) && ( t3->index != -1 ) )            {                // reach the end of the current edge, continue to the next edge                node = find_son( tree, node, ( unsigned char )t3->value );            }            else            {                node = 0;            }        }        ++t1;    }    ST_DeleteTree( tree );    free( str );    return( ret );}/* * Compute the substring kernel between px and py using the suffix tree, * where substrings of EXACTLY length n are considered */double suffix_tree_substring( const struct svm_node *px, const struct svm_node *py, int n, double lambda ){    const struct svm_node *t1 = px;    const struct svm_node *t2 = py;    // convert to original string    char *str = svm_node2string( t2 );    // build suffix tree    SUFFIX_TREE *tree = ST_CreateTree( ( unsigned char * )str, ( DBL_WORD )strlen( str ) );    free( str );    // search all possible substrings    double ret = 0.0, tmp;    int count = 0;    const struct svm_node *t3;    while( t1->index != -1 )    {        // start with the root's son        NODE *node = find_son( tree, tree->root, ( unsigned char )t1->value );        DBL_WORD str_len = 0;        t3 = t1;        tmp = ( lambda * lambda );        while( node != 0 )        {            DBL_WORD edge_label_start = node->edge_label_start;            DBL_WORD node_label_end = get_node_label_end( tree, node );            // scan the current edge            while( str_len < ( DBL_WORD )n &&                  t3->index != -1 &&                  edge_label_start <= node_label_end &&                  tree->tree_string[edge_label_start] == ( unsigned char )t3->value )            {                // update score                tmp *= ( lambda * lambda );                ++t3;                ++str_len;                ++edge_label_start;            }            if( str_len == ( DBL_WORD )n )            {                // compute the kernel score                count = 0;                search_occurrences( node, &count );                count++;                ret += ( count * tmp );                node = 0;            }            else if( ( edge_label_start > node_label_end ) && ( t3->index != -1 ) )            {                // reach the end of the current edge, continue to the next edge                node = find_son( tree, node, ( unsigned char )t3->value );            }            else            {                node = 0;            }        }        ++t1;    }    ST_DeleteTree( tree );    return( ret );}double string( const struct svm_node *px, const struct svm_node *py, double degree, double gamma, double coef0 ){    if( gamma == 0.0 )        return( brute_force_full_substring( px, py, ( int )degree, coef0 ) );    else if( gamma == 1.0 )        return( brute_force_substring( px, py, ( int )degree, coef0 ) );    else if( gamma == 2.0 )        return( suffix_tree_full_substring( px, py, ( int )degree, coef0 ) );    else if( gamma == 3.0 )        return( suffix_tree_substring( px, py, ( int )degree, coef0 ) );    else        return( 0.0 );}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -