⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 makedict.pl

📁 PostgreSQL7.4.6 for Linux
💻 PL
字号:
#!/usr/bin/perluse strict;use Getopt::Std;use locale;my %opt; getopts('l:he:s:ap:o:m:f', \%opt);if ( $opt{h} || ! ($opt{e}||$opt{s}) || !$opt{l} ) {	print<<EOT;Generator of variant of the Lovin's stemmer which uses a longest match algorithm.Author Teodor Sigaev <teodor\@stack.net>Usage:$0 -l LOCALENAME [ -e FILENAME ] [ -s FILENAME ] [ -p PREFIX ] [ -o FILENAME ] [ -a ] [ -m NUMBER ]-e FILENAME	- file with endings of word-s FILENAME	- file with list of stop-word-o FILENAME	- out file, default STDOUT-a 		- stop-word are strimmed-p PREFIX	- prefix of function and etc, default strimmed locale-m NUMBER	- minimal length of rest after semming, default 3-l LOCALENAME	- name of locale-f		- do not call tolower for each charAt least one of -e or -s must be definedEOTexit; }if ( ! defined $opt{p} ) {	$opt{p} = $opt{l};	$opt{p}=~s/[^a-zA-Z0-9_]+//g;}$opt{m}=3 if ! defined $opt{m};my ($enddata,$stopdata) = ('','');my $maxchild = 0;if ( $opt{e} ) {	my @tree;	buildtree(\@tree,  $opt{e}, 1);	printstruct( \@tree,  0, \$enddata);	undef @tree;} 	if ( $opt{s} ) {	my @tree;	buildtree(\@tree,  $opt{s}, 0);	printstruct( \@tree,  0, \$stopdata);	undef @tree;} 	die "No data\n" if ( ! (length $enddata || length $stopdata) );$enddata = "\t{0,0,0,0}" if ( ! length $enddata );$stopdata = "\t{0,0,0,0}" if ( ! length $stopdata );my $fh=\*STDOUT;if ( $opt{o} ) {	open(OUT,">$opt{o}") || die "Can;t open file '$opt{o}' for writing\n";	$fh = \*OUT;}my $linktype = 'uint32';if ( $maxchild <= 0xff ) {        $linktype='uint8';} elsif (  $maxchild <= 0xffff ) {        $linktype='uint16';}my $wherecheck = ( $opt{a} ) ?		"NULL,\n\t$opt{p}_is_stopword"	: 		"$opt{p}_is_stopword,\n\tNULL";my ($tolower, $resttolower) = ('','');if ( ! $opt{f} ) {	$tolower = '*cur = tolower( *cur );';	$resttolower=<<EOT;	while( cur - buf >= 0 ) {		*cur = tolower(*cur);		cur--;	}EOT}print {$fh} <<EOT;/* * Autogenerated file * * Variant of the Lovin's stemmer which uses a longest match algorithm. * Endings are stored in a suffix tree. */#ifdef DICT_BODY#include <ctype.h>typedef struct {	uint8	val;	uint8	flag;	uint8	right;	$linktype	child;} $opt{p}_NODE;/* is exists left tree ? */#define L       0x01/* finish word flag */#define F       0x02#define ISLEFT(x)       ((($opt{p}_NODE*)x)->flag & L)#define ISFINISH(x)     ((($opt{p}_NODE*)x)->flag & F)#define MINLENREST	$opt{m}static $opt{p}_NODE $opt{p}_endstree[]={$enddata};static $opt{p}_NODE $opt{p}_stoptree[]={$stopdata};static char*$opt{p}_stem( void* obj, char *in, int *len ) {	$opt{p}_NODE    *ptr = $opt{p}_endstree;	int     result = 0;	uint8 *buf = (uint8 *)in;	uint8 *cur = buf + (*len) - 1;	while( cur - buf >= MINLENREST ) {		$tolower		if ( ptr->val == *cur ) {			if ( ISFINISH(ptr) ) result = buf + (*len) - cur;			cur--;			if ( ! ptr->child ) break;			ptr += ptr->child;		} else if ( ptr->val > *cur ) {			if ( ISLEFT(ptr) )				ptr++;			else				break;		} else {			if ( ptr->right )				ptr += ptr->right;			else				break;		}	}	$resttolower	*len -= result;	return in;}static int$opt{p}_is_stopword( void *obj, char *in, int len ) {        $opt{p}_NODE    *ptr = $opt{p}_stoptree;        int     result = 0;        uint8 *buf = (uint8 *)in;        uint8 *cur = buf;        while( cur - buf < len ) {		$tolower                if ( ptr->val == *cur ) {                        cur++;                        if ( ISFINISH(ptr) ) result = cur - buf;                        if ( ! ptr->child ) break;                        ptr += ptr->child;                } else if ( ptr->val > *cur ) {                        if ( ISLEFT(ptr) )                                ptr++;                        else                                break;                } else {                        if ( ptr->right )                                ptr += ptr->right;                        else                                break;                }        }        return (result==len) ? 1 : 0;}#undef L#undef F#undef ISLEFT#undef ISFINISH#undef MINLENREST #endif /* DICT_BODY */#ifdef DICT_TABLETABLE_DICT_START	\"$opt{l}\",	NULL,	NULL,	$opt{p}_stem,	$wherecheckTABLE_DICT_END#endifEOTclose($fh) if ( $fh != \*STDOUT );sub buildtree {	my ($reftree,$file, $needreverse) = @_;	open(DATA,$file) || die "Can't open file '$file'\n";	while(<DATA>) {		chomp;		next if ! length $_;		$_ = lc($_) if ! $opt{f};		addtostruct( $reftree, ( $needreverse ) ? scalar(reverse($_)) : $_ );	}	close DATA;}sub mkbintree {	my ( $start, $stop, $rprop, $rres) = @_;        	my $middle = $start + int( ($stop-$start)/2 );	push( @$rres, $rprop->[$middle] );	my $idx = $#$rres;	$rres->[$idx]{right}=0;	$rres->[$idx]{left}=0;	return 1 if ( $start == $stop );	my $leftsize = 0;	if ( $middle!=$start ) {		$rres->[$idx]{left}=1;  		$leftsize = mkbintree( $start, $middle-1, $rprop, $rres );		$rres->[$idx]{right}=$leftsize+1;	} else {		$rres->[$idx]{right} = 1;	}	return 1 + $leftsize + mkbintree( $middle+1, $stop, $rprop, $rres );}sub addtostruct {	my $node = shift;	my ($char, $subval) = split('', shift, 2);	$char = ord( $char );	if ( ! defined $node->[$char] ) {		$node->[$char] = {};		$node->[$char]{finish} = length $subval;		$node->[$char]{child} = [];	} elsif ( ! length $subval ) {		$node->[$char]{finish} = 0;	}	addtostruct( $node->[$char]{child}, $subval ) if ( length $subval );}sub printstruct {	my ($node, $pre, $refout) = @_;	my $add = 0;	my @prop;	my $outchild;	my $current = 0;	my $poschild=0;	my @tmp;	foreach my $i ( 0..255 ) {		next if ( !defined $node->[ $i ] );		push @prop , { val=>$i,			nchild=>printstruct( $node->[ $i ]{child}, 1, \$outchild ),			poschild=>$poschild };		$poschild += $prop[$#prop]{nchild};	}	return 0 if $#prop < 0;	if ($pre) {		$$refout .= ",\n\n";	}	mkbintree(0,$#prop,\@prop,\@tmp);	@prop = @tmp;		$current=$#prop+1;	foreach my $i ( 0..$#prop ) {		my $flag = ($prop[$i]{left}) ? 'L' : undef;		if ( $node->[ $prop[$i]{val} ]{finish}==0 ) {			$flag .= '|' if defined $flag;			$flag .= 'F';		} elsif ( ! defined $flag ) {			$flag='0';		}		$$refout .= "\t{'".chr( $prop[$i]{val} )."',".				$flag.','.				$prop[$i]{right}.','.				(($prop[$i]{nchild}==0)?0:($prop[$i]{poschild}+$current)).'}'.				(($i==$#prop)? '' : ",\n");		$maxchild = $prop[$i]{poschild}+$current if			( $prop[$i]{nchild} && $prop[$i]{poschild}+$current > $maxchild );		$current--;		$add += $prop[$i]{nchild};	}	$$refout .= $outchild;	return $#prop+1 + $add;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -