📄 svm_struct_api.c
字号:
void write_label(FILE *fp, LABEL y)
{
/* Writes label y to file handle fp. */
tree t = remove_parent_annotation(y.parse, y.si);
if(y.parse)
write_tree(fp, t, y.si);
else
fprintf(fp,"NULL");
fprintf(fp,"\n");
free_tree(t);
}
void free_pattern(PATTERN x) {
/* Frees the memory of x. */
free(x.sentence.e);
}
void free_label(LABEL y) {
/* Frees the memory of y. */
if(y.parse) free_tree(y.parse);
}
void free_struct_model(STRUCTMODEL sm)
{
/* Frees the memory of model. */
/* if(sm.w) free(sm.w); */ /* this is free'd in free_model */
if(sm.svm_model) free_model(sm.svm_model,1);
/* add free calls for user defined data here */
free_grammar(sm.grammar);
free_vihashl(sm.weightid_ht);
si_free(sm.si);
}
void free_struct_sample(SAMPLE s)
{
/* Frees the memory of sample s. */
int i;
for(i=0;i<s.n;i++) {
free_pattern(s.examples[i].x);
free_label(s.examples[i].y);
}
free(s.examples);
}
void print_struct_help()
{
/* Prints a help text that is appended to the common help text of
svm_struct_learn. */
printf(" --s [1..] -> Maximum length of sentences to use (default 10)\n");
printf(" --p {0,1} -> Use parent annotation for rules (default 0)\n");
printf(" --a {0,1} -> Use borders as feature (default 0)\n");
printf(" --b {0,1} -> Use length of parent span as feature (default 0)\n");
printf(" --c {0,1} -> Use length of childrens spans as features (default 0)\n");
printf(" --d {0,1} -> Use difference between children spans as feature\n");
printf(" (default 0)\n");
printf(" -l {0,1} NOTE: Added loss type '1' in addition to the 0/1-loss '0'.\n");
printf(" Loss type '1' uses (1-F_1) as the loss as in [2].\n");
printf("\n");
printf("The CKY parser is based on the implementation of Mark Johnson described in:\n");
printf(" Johnson, M. (1999). PCFG models of linguistic tree representations.\n");
printf(" Computational Linguistics.\n");
}
void parse_struct_parameters(STRUCT_LEARN_PARM *sparm)
{
/* Parses the command line parameters that start with -- */
int i;
sparm->maxsentlen=10;
sparm->parent_annotation=0;
sparm->feat_borders=0;
sparm->feat_parent_span_length=0;
sparm->feat_children_span_length=0;
sparm->feat_diff_children_length=0;
sparm->si=NULL;
for(i=0;(i<sparm->custom_argc) && ((sparm->custom_argv[i])[0] == '-');i++) {
switch ((sparm->custom_argv[i])[2])
{
case 's': i++; sparm->maxsentlen=atol(sparm->custom_argv[i]); break;
case 'p': i++; sparm->parent_annotation=atol(sparm->custom_argv[i]); break;
case 'a': i++; sparm->feat_borders=atol(sparm->custom_argv[i]); break;
case 'b': i++; sparm->feat_parent_span_length=atol(sparm->custom_argv[i]); break;
case 'c': i++; sparm->feat_children_span_length=atol(sparm->custom_argv[i]); break;
case 'd': i++; sparm->feat_diff_children_length=atol(sparm->custom_argv[i]); break;
default: printf("\nUnrecognized option %s!\n\n",sparm->custom_argv[i]);
exit(0);
}
}
}
void print_struct_help_classify()
{
/* Prints a help text that is appended to the common help text of
svm_struct_classify. */
printf(" --* string -> custom parameters that can be adapted for struct\n");
printf(" learning. The * can be replaced by any character\n");
printf(" and there can be multiple options starting with --.\n");
}
void parse_struct_parameters_classify(char *attribute, char *value)
{
/* Parses one command line parameters that start with -- . The name
of the parameter is given in attribute, the value is given in
value. */
switch (attribute[2])
{
/* case 'x': strcpy(xvalue,value); break; */
default: printf("\nUnrecognized option %s!\n\n",attribute);
exit(0);
}
}
/* ---------------------------------------------------------------- */
SVECTOR *collect_phi(bintree parse, STRUCTMODEL *sm,
size_t lpos, size_t *rpos, size_t start, size_t end)
{
SVECTOR *vecleft=NULL, *vecright=NULL, *sum, *vec;
long weightid, mpos=0;
vindex vi;
vi=make_vindex(4);
vi->e[0]=parse->label;
vi->n=1;
if(parse->left) { /* construct binary rule from tree */
vi->e[1]=parse->left->label;
vi->n=2;
vecleft=collect_phi(parse->left,sm,lpos,rpos,start,end);
if(parse->right) {
vi->e[2]=parse->right->label;
vi->n=3;
mpos=*rpos;
vecright=collect_phi(parse->right,sm,mpos,rpos,start,end);
}
}
else {
*rpos=lpos+1;
}
if(parse->left && parse->right) { /* binary rule */
weightid=vihashl_ref(sm->weightid_ht,vi);
assert(weightid);
vec=phi_brule(weightid,parse->left,parse->right,sm,lpos,mpos,
*rpos,start,end);
vec->next=vecleft; /* prepend vec to vec of left tree */
append_svector_list(vec,vecright);/* concatenate all for preorder list */
sum=vec;
}
else if(parse->left) { /* unary rule */
weightid=vihashl_ref(sm->weightid_ht,vi);
assert(weightid);
vec=phi_urule(weightid,parse->left,sm,lpos,*rpos,start,end);
vec->next=vecleft; /* prepend vec to vec of left tree */
sum=vec;
}
else { /* terminal */
sum=NULL;
}
vindex_free(vi);
return(sum);
}
SVECTOR *phi_urule(long weightid, bintree child, STRUCTMODEL *sm,
int lpos, int rpos, int start, int end)
{
WORD *feat;
SVECTOR *vec;
long basefnum;
long pos;
WORD empty[2];
assert(weightid != 0);
if(weightid <= 0) {
empty[0].wnum=0;
return(create_svector(empty,"",1.0));
}
basefnum=weightid;
feat=(WORD *)MALLOC(sizeof(WORD)*MAXFEAT);
feat[0].wnum=0;
pos=0;
/* base feature */
pos=add_feature(feat,pos,basefnum+0,1.0);
/* borders beginning or end */
if(sm->sparm->feat_borders) {
if(lpos == start)
pos=add_feature(feat,pos,basefnum+1,1.0);
if(rpos == end)
pos=add_feature(feat,pos,basefnum+2,1.0);
}
/* length of span of whole rule */
if((sm->sparm->feat_parent_span_length)
|| (sm->sparm->feat_children_span_length))
pos=encode_number(feat,pos,basefnum+3,rpos-lpos,1.0,2,3,4,5,7,9);
assert(pos<MAXFEAT);
vec=create_svector(feat,"",1.0);
free(feat);
return(vec);
}
SVECTOR *phi_brule(long weightid, bintree left, bintree right, STRUCTMODEL *sm,
int lpos, int mpos, int rpos, int start, int end)
{
WORD *feat;
SVECTOR *vec;
long basefnum;
long pos;
WORD empty[2];
assert(weightid != 0);
if(weightid <= 0) {
empty[0].wnum=0;
return(create_svector(empty,"",1.0));
}
basefnum=weightid;
feat=(WORD *)MALLOC(sizeof(WORD)*(MAXFEAT+1));
feat[0].wnum=0;
pos=0;
/* base feature */
pos=add_feature(feat,pos,basefnum+0,1.0);
/* borders beginning or end */
if(sm->sparm->feat_borders) {
if(lpos == start)
pos=add_feature(feat,pos,basefnum+1,1.0);
if(rpos == end)
pos=add_feature(feat,pos,basefnum+2,1.0);
}
/* length of span of whole rule */
if(sm->sparm->feat_parent_span_length)
pos=encode_number(feat,pos,basefnum+3,rpos-lpos,1.0,2,3,4,5,7,9);
/* length of span of each child */
if(sm->sparm->feat_children_span_length) {
pos=encode_number(feat,pos,basefnum+9,rpos-mpos,1.0,2,3,4,5,7,9);
pos=encode_number(feat,pos,basefnum+15,mpos-lpos,1.0,2,3,4,5,7,9);
}
/* difference between left and right child */
if(sm->sparm->feat_diff_children_length)
pos=encode_number(feat,pos,basefnum+21,(rpos-mpos)-(mpos-lpos),1.0,-7,-3,-1,1,3,7);
assert(pos<MAXFEAT);
vec=create_svector(feat,"",1.0);
free(feat);
return(vec);
}
int add_feature(WORD *feat, int pos, long fnum, long weight)
/* add feature fnum with value weight to end of feature list */
{
/* find end of feature list */
for(pos=0;feat[pos].wnum>0;pos++);
feat[pos].wnum=fnum;
feat[pos].weight=weight;
pos++;
feat[pos].wnum=0;
return(pos);
}
int encode_number(WORD *feat,int pos,long basefnum,long number,double weight,
long a, long b, long c, long d, long e, long f)
/* encodes a number as a vector of 6 featurs by thresholding it
against a,b,c,d,e,f */
{
/* find end of feature list */
for(pos=0;feat[pos].wnum>0;pos++);
if(number <= a)
pos=add_feature(feat,pos,basefnum+0,weight);
if(number <= b)
pos=add_feature(feat,pos,basefnum+1,weight);
if(number <= c)
pos=add_feature(feat,pos,basefnum+2,weight);
if(number <= d)
pos=add_feature(feat,pos,basefnum+3,weight);
if(number <= e)
pos=add_feature(feat,pos,basefnum+4,weight);
if(number <= f)
pos=add_feature(feat,pos,basefnum+5,weight);
return(pos);
}
double urule_value(urule rule, bintree child, STRUCTMODEL *sm,
int lpos, int rpos, int start, int end)
{
double val;
SVECTOR *vec;
DOC *ex;
assert(rule->weightid >= -1);
assert(rule->weightid != 0);
assert(rule->weightid < sm->sizePsi);
/* printf("urule: %ld\n",rule->weightid); */
if(rule->weightid == -1) {
return(0.0);
}
else {
vec=phi_urule(rule->weightid,child,sm,lpos,rpos,start,end);
ex=create_example(-1,-1,-1,1,vec);
val=classify_example(sm->svm_model,ex);
free_example(ex,1);
return(val);
}
}
double brule_value(brule rule, bintree left, bintree right, STRUCTMODEL *sm,
int lpos, int mpos, int rpos, int start, int end)
{
double val;
SVECTOR *vec;
DOC *ex;
assert(rule->weightid >= -1);
assert(rule->weightid != 0);
assert(rule->weightid < sm->sizePsi);
/* printf("brule: %ld\n",rule->weightid); */
if(rule->weightid == -1) {
return(0.0);
}
else {
vec=phi_brule(rule->weightid,left,right,sm,lpos,mpos,rpos,start,end);
ex=create_example(-1,-1,-1,1,vec);
val=classify_example(sm->svm_model,ex);
free_example(ex,1);
return(val);
}
}
void count_local_trees(const tree t, vihashl localtree_ht)
{
si_index e[MAXRHS];
struct vindex vi = {0, e};
tree p;
if (!t || !t->subtrees) return;
vi.n = 0;
vi.e[vi.n++] = t->label;
for(p = t->subtrees; p; p=p->sibling) {
assert(vi.n < MAXRHS);
vi.e[vi.n++] = p->label;
count_local_trees(p, localtree_ht);
}
vihashl_inc(localtree_ht, &vi, 1); /* save this list */
}
void write_local_trees(FILE *fp,const vihashl localtree_ht, si_t si)
{
vihashlit hit;
vindex vi;
long count;
size_t i;
char *string;
for (hit = vihashlit_init(localtree_ht); vihashlit_ok(hit); hit = vihashlit_next(hit)) {
vi = (vindex) hit.key;
assert(vi->n > 0);
assert(vi->n <= MAXRHS);
count = hit.value;
string = si_index_string(si, vi->e[0]);
assert(string);
fprintf(fp,"%ld\t%s " REWRITES, count, string);
for (i=1; i<vi->n; i++) {
string = si_index_string(si, vi->e[i]);
assert(string);
fprintf(fp," %s", string);
}
fprintf(fp,"\n");
}}
int tree_eq(const tree t1, const tree t2)
{
tree p1,p2;
if ((!t1) && (!t2)) return(1);
if ((!t1) || (!t2)) return(0);
if(t1->label == t2->label) {
for(p1 = t1->subtrees,p2 = t2->subtrees; p1 || p2; p1=p1->sibling,p2=p2->sibling) {
if(!tree_eq(p1,p2))
return(0);
}
}
else
return(0);
return(1);
}
/* ------- parser -------- */
chart
chart_make(size_t n)
{
size_t i, nn = CHART_SIZE(n);
chart c = MALLOC(nn*sizeof(sihashcc));
for (i=0; i<nn; i++)
c[i] = NULL; /* chart cell will be constructed in apply_unary() */
return c;
}
void
chart_free(chart c, size_t n)
{
size_t i;
for (i=0; i<CHART_SIZE(n); i++)
free_sihashcc(c[i]);
FREE(c);
}
void
chart_entry_display(FILE *fp, sihashcc chart_entry, si_t si)
{
sihashccit hit;
tree t;
for (hit=sihashccit_init(chart_entry); sihashccit_ok(hit);
hit = sihashccit_next(hit)) {
fprintf(fp, "\n %s: %g \t", si_index_string(si, hit.key),
(double) hit.value->prob);
t = bintree_tree(&hit.value->tree, si);
write_tree(fp, t, si);
free_tree(t);
if(hit.value->present2) {
fprintf(fp, "\t : %g \t", (double) hit.value->prob2);
t = bintree_tree(&hit.value->tree2, si);
write_tree(fp, t, si);
free_tree(t);
}
}
fprintf(fp, "\n---\n");
}
int is_urule_cycle(bintree tree, si_index label, si_t si)
{
if(!tree) /* is empty tree */
return(0);
else if(remove_parent_from_label(tree->label, si)
== remove_parent_from_label(label,si))
/* already member of unary chain */
return(1);
else if(tree->right) /* not produced by a unary rule */
return(0);
else {
return(is_urule_cycle(tree->left,label,si));
}
}
int
add_edge_zeroone(sihashcc chart_entry, si_index label,
bintree left, bintree right,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -