📄 string-kernels.c
字号:
/*** Copyright (C) 2006 Thai Computational Linguistics Laboratory (TCL)** National Institute of Information and Communications Technology (NICT)** Canasai Kruengkrai <canasai xx gmail yy com, where xx=at and yy=dot>**** This file is part of the `libs' library.**** This library is free software; you can redistribute it and/or modify** it under the terms of the GNU General Public License as published by** the Free Software Foundation; either version 2 of the License, or** (at your option) any later version.**** This program is distributed in the hope that it will be useful,** but WITHOUT ANY WARRANTY; without even the implied warranty of** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the** GNU General Public License for more details.**** You should have received a copy of the GNU General Public License** along with this program; if not, write to the Free Software** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*/#include <math.h>#include <stdio.h>#include <stdlib.h>#include <ctype.h>#include <float.h>#include <string.h>#include <stdarg.h>#include "../svm/libsvm-string-2.71/svm.h"#include "suffix_tree.h"/* * Compute the substring kernel between px and py using brute-force matching, * where substrings up to length n are considered */double brute_force_full_substring( const struct svm_node *px, const struct svm_node *py, int n, double lambda ){ int k; double ret = 0.0, tmp; const struct svm_node *t1, *t2, *t3, *t4; t1 = px; while( t1->index != -1 ) { t2 = py; while( t2->index != -1 ) { if( t1->value == t2->value ) { t3 = t1; t4 = t2; tmp = lambda * lambda; k = 0; while( ( k < n ) && ( t3->index != -1 ) && ( t4->index != -1 ) && ( t3->value == t4->value ) ) { ret += tmp; tmp *= ( lambda * lambda ); ++k; ++t3; ++t4; } } ++t2; } ++t1; } return( ret );}/* * Compute the substring kernel between px and py using brute-force matching, * where substrings of EXACTLY length n are considered */double brute_force_substring( const struct svm_node *px, const struct svm_node *py, int n, double lambda ){ int k; double ret = 0.0, tmp; const struct svm_node *t1, *t2, *t3, *t4; t1 = px; while( t1->index != -1 ) { t2 = py; while( t2->index != -1 ) { if( t1->value == t2->value ) { t3 = t1; t4 = t2; tmp = lambda * lambda; k = 0; while( ( k < n ) && ( t3->index != -1 ) && ( t4->index != -1 ) && ( t3->value == t4->value ) ) { tmp *= ( lambda * lambda ); ++k; ++t3; ++t4; } if( k == n ) ret += tmp; } ++t2; } ++t1; } return( ret );}char *svm_node2string( const struct svm_node *x ){ const struct svm_node *tp = x; int str_len = 0, i; while( tp->index != -1 ) { ++str_len; ++tp; } char *str = ( char * )malloc( ( str_len + 1 ) * sizeof ( char ) ); for( i = 0; i < str_len; i++ ) { str[i] = ( unsigned char )x->value; ++x; } str[i++] = '\0'; return( str );}void search_occurrences( NODE* node, int *count ){ NODE* child = node->sons; while( child != 0 ) { if( !( child->path_position == child->father->path_position ) ) (*count)++; if( child->sons != 0 ) search_occurrences( child, count ); child = child->right_sibling; } return;}/* * Compute the substring kernel between px and py using the suffix tree, * where substrings up to length n are considered */double suffix_tree_full_substring( const struct svm_node *px, const struct svm_node *py, int n, double lambda ){ const struct svm_node *t1 = px; const struct svm_node *t2 = py; char *str = svm_node2string( t2 ); // build suffix tree SUFFIX_TREE *tree = ST_CreateTree( ( unsigned char * )str, ( DBL_WORD )strlen( str ) ); // search all possible substrings double ret = 0.0, tmp; int count = 0; const struct svm_node *t3; while( t1->index != -1 ) { // start with the root's son NODE *node = find_son( tree, tree->root, ( unsigned char )t1->value ); DBL_WORD str_len = 0; t3 = t1; tmp = ( lambda * lambda ); while( node != 0 ) { DBL_WORD edge_label_start = node->edge_label_start; DBL_WORD node_label_end = get_node_label_end( tree, node ); // scan the current edge while( str_len < ( DBL_WORD )n && t3->index != -1 && edge_label_start <= node_label_end && tree->tree_string[edge_label_start] == ( unsigned char )t3->value ) { // compute the kernel score count = 0; search_occurrences( node, &count ); count++; ret += ( count * tmp ); // update score tmp *= ( lambda * lambda ); ++t3; ++str_len; ++edge_label_start; } if( ( edge_label_start > node_label_end ) && ( t3->index != -1 ) ) { // reach the end of the current edge, continue to the next edge node = find_son( tree, node, ( unsigned char )t3->value ); } else { node = 0; } } ++t1; } ST_DeleteTree( tree ); free( str ); return( ret );}/* * Compute the substring kernel between px and py using the suffix tree, * where substrings of EXACTLY length n are considered */double suffix_tree_substring( const struct svm_node *px, const struct svm_node *py, int n, double lambda ){ const struct svm_node *t1 = px; const struct svm_node *t2 = py; // convert to original string char *str = svm_node2string( t2 ); // build suffix tree SUFFIX_TREE *tree = ST_CreateTree( ( unsigned char * )str, ( DBL_WORD )strlen( str ) ); free( str ); // search all possible substrings double ret = 0.0, tmp; int count = 0; const struct svm_node *t3; while( t1->index != -1 ) { // start with the root's son NODE *node = find_son( tree, tree->root, ( unsigned char )t1->value ); DBL_WORD str_len = 0; t3 = t1; tmp = ( lambda * lambda ); while( node != 0 ) { DBL_WORD edge_label_start = node->edge_label_start; DBL_WORD node_label_end = get_node_label_end( tree, node ); // scan the current edge while( str_len < ( DBL_WORD )n && t3->index != -1 && edge_label_start <= node_label_end && tree->tree_string[edge_label_start] == ( unsigned char )t3->value ) { // update score tmp *= ( lambda * lambda ); ++t3; ++str_len; ++edge_label_start; } if( str_len == ( DBL_WORD )n ) { // compute the kernel score count = 0; search_occurrences( node, &count ); count++; ret += ( count * tmp ); node = 0; } else if( ( edge_label_start > node_label_end ) && ( t3->index != -1 ) ) { // reach the end of the current edge, continue to the next edge node = find_son( tree, node, ( unsigned char )t3->value ); } else { node = 0; } } ++t1; } ST_DeleteTree( tree ); return( ret );}double string( const struct svm_node *px, const struct svm_node *py, double degree, double gamma, double coef0 ){ if( gamma == 0.0 ) return( brute_force_full_substring( px, py, ( int )degree, coef0 ) ); else if( gamma == 1.0 ) return( brute_force_substring( px, py, ( int )degree, coef0 ) ); else if( gamma == 2.0 ) return( suffix_tree_full_substring( px, py, ( int )degree, coef0 ) ); else if( gamma == 3.0 ) return( suffix_tree_substring( px, py, ( int )degree, coef0 ) ); else return( 0.0 );}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -