📄 chart.c
字号:
/* This code is the statistical natural language parser described in M. Collins. 1999. Head-Driven Statistical Models for Natural Language Parsing. PhD Dissertation, University of Pennsylvania. Copyright (C) 1999 Michael Collins This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/#include <assert.h>#include "chart.h"#define CCPROBSMALL 0.0000000000000000001/* pointer to the current sentence being parsed*/sentence_type *current;/* initialise the chart */void init_chart();/* add a sentence to the chart (starts the parsing process) */void add_sentence_to_chart(sentence_type *sentence);/*complete the chart for words spanning s..e inclusive*/void complete(int s,int e);/* edges is an array of edges in the chart childs is an array used for the children of each node in the chart: childs[edge[i].child1] to childs[edge[i].child1+edge[i].numchild-1] are the children of the i'th edge in left to right order (being indexes into the edges array themselves) */#define PMAXEDGES 200000#define PMAXCHILDS 3000000edge_type edges[PMAXEDGES];int numedges;int childs[PMAXCHILDS];int numchilds;/* The edges in the chart spanning words s to e inclusive are contained in edges[sindex[s][e]] to edges[eindex[s][e]] inclusive if no edges have been added to the chart for the span s..e, sindex[s][e]==-1, eindex[s][e]==-2 */int sindex[PMAXWORDS][PMAXWORDS];int eindex[PMAXWORDS][PMAXWORDS];void init_index();/* add_edge(s,e) adds an edge spanning words s...e inclusive the edge is at edges[numedges], rather than being passed in as an argument returns an index into edges array which points to where the edge has been added returns -1 if the edge isn't added, i.e. because it fails the dynamic programming/beam conditions */int add_edge(int s,int e);/* saments is a data structure that supports add_edge saments allows edges in the chart with the same non-terminal label to be stored in a linked list, by pointing to the first element of the linked list. This helps efficiency of the dynamic programming algorithm (when checking to see if there's an edge in the chart with the same label, head word etc. it's sufficient to look down the linked list for that non-terminal) saments also facilitates keeping only the top PMAXHEADS edges with label i for each i (another kind of beam which turns out not to be all that useful, but I left the code in anyway) saments contains information for non-terminals in the current span being worked on in the chart: at each point when a new span is started saments is re-initialised saments[i] contains the following info for non-terminal i: edge1 is the first edge in the chart with label i. edges[i].next points to the next element in the linked list, and so on minedge is the edge in the list with the lowest probability minprob is the prob of the lowest prob edge numedges is the number of edges with the i'th non-terminal (so once numedges==PMAXHEADS, when adding an edge first test if its prob is greater than minprob, if so replace minedge and update minedge/minprob)*/typedef struct { int edge1; int minedge; double minprob; char numedges;} sament_type;sament_type saments[GMAXNTS];void init_saments();void find_sament_minprob(int nt);#define PMAXHEADS 100/* join_2_edges_follow joins two edges where the modifier follow the head: (the new edge is added using add_edge) P P | => | H ... R H ... R edge1 edge2 newedgeprob = P1 P2 P1*P2*Prob_r(R | P,H) where edge1 spans words s..m inclusive edge2 spans words m+1..e inclusive edge1 is the e1'th element of the edges array edge2 is the i'th element of the edges array for all i=e2s[0] .. e2s[ne2s-1] so there are ne2s joining operations: they are all done in one call for efficiency reasons */void join_2_edges_follow(int e1,int s,int m,int e,int *e2s,int ne2s);/* join_2_edges_precede joins two edges but where the modifier precedes the head: (the new edge is added using add_edge) P P | => | L ... H L ... H edge1 edge2 newedge edge2 is the e1'th element of the edges array edge1 is the i'th element of the edges array for all i=e1s[0] .. e1s[ne1-1] so there are ne1 joining operations: they are all done in one call for efficiency reasons*/void join_2_edges_precede(int e2,int s,int m,int e,int *e1s,int ne1);/* join_2_edges_cc joins two edges in a coordination relationship: (the new edge is added using add_edge) P P | => | H ... CC R H ... CC R edge1 edge2 newedge where edge1 spans words s..m inclusive edge2 spans words m+2..e inclusive word m+1 is tagged as a CC by the POS tagger edge1 is the e1'th element of the edges array edge1 is the e2'th element of the edges array */void join_2_edges_cc(int e1,int e2,int s,int m,int e);/* add_singles add all the unary rules for words spanning s..e: P | H => H edge1 newedge prob = P1 prob = P1*Prob_h(H | P) where edge1 = i'th edge in the edges array for si<=i<=ei*/void add_singles(int s,int e,int si,int ei);/* add_stops adds all the stop probabilities for edges which do not yet have these added (i.e. for all s<=i<=e and edges[i].stop==0 a new edge is added with edges[newedge].stop==1 and the two stop probabilities multiplied in*/void add_stops(int s,int e,int si,int ei);/* add_traces adds all traces for edges that have both a gap and an NP requirement to either their left or right*/void add_traces(int s,int e);/* add_singles_stops(s,e) adds all stop probabilities for edges spanning s..e, then builds unary rules on top these, then adds stop probs for these unary rules, then builds unary rules on top of these, and so on and so on...*/void add_singles_stops(int s,int e);/* adds are data-structures are used by add_singles_stops */int ADDFLAG;int adds1[100000];int numadds1;int adds2[100000];int numadds2;int *adds;int *numadds;/* calc_prob2 calculates prob2 for edges[edge] i.e. it sets edges[edge].prob2 = edges[edge].prob + prior where prior is an estimate of Prob(word,non-terminal,tag) for the edge, an estimate of how likely a non-terminal Prob(word,non-terminal,tag) is to appear in the correct tree */void calc_prob2(int edge);/* prints an edge tabbed in by offset spaces */void print_edge(int e,int offset);/*bestprobs[s][e] holds the highest probability for all edges added spanning words s..e it's used for pruning edges from the chart note that it is continually updated as edges are added for section s..e the thresholding is used at two stages: first while the edges for s..e are being constructed -- at this stage some edges will make it through the beam in spite of being outside the beam once all the edges have been added (a high probability edge may be added later than a low probability edge, and make it invalid); second, when the edges are being built on an initial check is made to see if they fall in the beam */double bestprobs[GMAXNTS][GMAXNTS];/* returns 1 if the edge is within the current beam. Used while the edges for s..e in the chart are being added */int inbeam(int edge,int s,int e);/* returns 1 if the edge is within the current beam. Also sets edges[edge].inbeam indicating whether or not the edge makes the beam. Used when s..e is being built upon, at which point edges can definitely be classified as falling in/out of the beam */int inbeam2(int edge,int s,int e);int TREEBANKOUTPUTFLAG;void set_treebankoutputflag(int flag){ TREEBANKOUTPUTFLAG = flag;}/*==========================================================================*//*parse a sentence, print the output to stdout*/void parse_sentence(sentence_type *sentence){ int i,j; reset_prior_hashprobs(); effhash_newsent(&eff_hash); init_chart(GMAXNTS); current = sentence; add_sentence_to_chart(current); for(i=0;i<current->nws_np;i++) for(j=i-1;j>=0;j--) { complete(j,i); } if(print_best_parse()!=-1) return; printf("PROB 0 0 0\n"); print_noparse(current);}void init_index(){ int i,j; for(i=0;i<PMAXWORDS;i++) for(j=0;j<PMAXWORDS;j++) { sindex[i][j]=-1; eindex[i][j]=-2; }}void init_saments(){ int i; for(i=0;i<GMAXNTS;i++) saments[i].edge1=-1;}void init_chart(){ init_index(); numadds = &numadds1; adds = adds1; numadds1 = 0; init_saments(); numedges=0; numchilds=0;}/* note the following bug: for singles, an edge can be replaced when another edge is built on top of it. For example, SBAR -> S is added, then the S is replaced */int add_edge(int s,int e){ int i,j; int nt; /* if type!=0, i.e. not a part of speech tag, return -1 if the edge doesn't make the threshold pthresh even before its prior prob is added */ if(edges[numedges].type!=0) { if(edges[numedges].prob<pthresh) return -1; } /*inbeam=2 means we haven't checked whether the edge falls within the beam or not yet, as not all edges have been added for span s...e*/ edges[numedges].inbeam=2; calc_prob2(numedges); if(sindex[s][e]==-1) /*first edge to be added for this span*/ { if(edges[numedges].type==0) bestprobs[s][e]=-10000; /*don't count POS tags in beam comparisons*/ else bestprobs[s][e]=edges[numedges].prob2; /*bestprobs is set to prob2, currently the highest probability edge spanning words s..e*/ } else if(edges[numedges].label==NT_TOP) { /*NT_TOP can only span the whole sentence (it's the start symbol)*/ if(s!=0 || e!=current->nws_np-1) return -1; } else { /*return if not a POS tag and not in the beam (note: POS tags are always kept)*/ if(edges[numedges].type!=0&&!inbeam(numedges,s,e)) return -1; /*update bestprobs if the prob of this new edge beats the current high for span s..e*/ else if(edges[numedges].prob2 > bestprobs[s][e] && edges[numedges].type!=0) bestprobs[s][e] = edges[numedges].prob2; } nt=edges[numedges].label; /*case 1: saments[nt].edge== -1, indicating that this is the first edge with label nt to be added to span s..e of the chart*/ if(saments[nt].edge1==-1) { /*initialise saments to point to this edge*/ saments[nt].edge1=numedges; saments[nt].minedge=numedges; saments[nt].minprob=edges[numedges].prob; saments[nt].numedges=1; /*there are no other edges with label nt in this span, so the next element in the linked list of edges with label nt is null */ edges[numedges].next=-1; /*if it's the first edge in the chart of *any* label then initialise sindex[s][e]*/ if(sindex[s][e]==-1) sindex[s][e]=numedges; eindex[s][e]=numedges; /*note that an edge has been added*/ adds[*numadds] = numedges; (*numadds)++; ADDFLAG=1; numedges++; /*return a pointer to the edge*/ return numedges-1; } /*Case 2: it's not the first edge with label nt to be added, but saments[nt].numedges<PMAXHEADS indicates that the beam for label nt is not yet full (only the top PMAXHEADS non-terminals with this label are kept)*/ if(saments[nt].numedges<PMAXHEADS) { i=saments[nt].edge1; /*go down the linked list, if a matching edge is found then the dynamic programming step kicks in: either replace the old edge if it has lower probability, otherwise return without adding anything */ while(i!=-1) { if(equal_edges(&edges[i],&edges[numedges])) if(edges[numedges].prob>edges[i].prob) { j=edges[i].next; edges[i]=edges[numedges]; edges[i].next=j; find_sament_minprob(nt); adds[*numadds] = i; (*numadds)++; ADDFLAG=1; return i; } else return -1; j=i; i=edges[i].next; } /* at this stage no matching non-terminals have been found. add the new edge at position edges[numedges] edges[j] was the last non-terminal with label nt: edges[j].next=numedges adds the new edge to the end of the linked list */ edges[j].next=numedges; edges[numedges].next=-1; saments[nt].numedges++; if(edges[numedges].prob<saments[nt].minprob) {saments[nt].minprob=edges[numedges].prob;saments[nt].minedge=numedges;} adds[*numadds] = numedges; (*numadds)++; eindex[s][e]=numedges; numedges++; ADDFLAG=1; return numedges-1; } /*Case 3: there are already PMAXHEADS edges with label nt*/ /*if the new edge has lower probability than the lowest in the linked list, don't add it*/ if(edges[numedges].prob<saments[nt].minprob) return -1; /*otherwise check the dynamic programming case again*/ i=saments[nt].edge1; while(i!=-1) { if(equal_edges(&edges[i],&edges[numedges])) if(edges[numedges].prob>edges[i].prob) { j=edges[i].next; edges[i]=edges[numedges]; edges[i].next=j; find_sament_minprob(nt); adds[*numadds] = i; (*numadds)++; ADDFLAG=1; return i; } else return -1; i=edges[i].next; } /*if dynamic programming does not reject/accept the new edge, replace the lowest probability edge in the linked list with the new edge*/ i=saments[nt].minedge; j=edges[i].next; edges[i]=edges[numedges]; edges[i].next=j; find_sament_minprob(nt); adds[*numadds] = i; (*numadds)++; ADDFLAG=1; return i;}/*update the values for sament*/void find_sament_minprob(int nt){ int i; i=saments[nt].edge1; saments[nt].minprob=edges[i].prob; saments[nt].minedge=i; i=edges[i].next; while(i!=-1) { if(edges[i].prob<saments[nt].minprob) {saments[nt].minprob=edges[i].prob;saments[nt].minedge=i;} i=edges[i].next; }}void add_sentence_to_chart(sentence_type *sentence){ int i,j,k; char *td; int word; char temptd[GMAXNTS]; int flag; for(i=0;i<sentence->nws_np;i++) { init_saments(); word=sentence->wordnos[i]; flag=0; if(fwords[word]!=GUNKNOWN) { for(j=0;j<GMAXNTS;j++) if(tagdict[fwords[word]][j]==1) flag=1; } if(flag==0||!ALLTAGS) { td=temptd; for(j=0;j<GMAXNTS;j++) temptd[j]=0; temptd[sentence->tagnos[i]]=1; } else td=tagdict[fwords[word]]; for(j=0;j<100;j++) if(td[j]==1) { edges[numedges].head=i; edges[numedges].headtag=j; edges[numedges].headch=-1; edges[numedges].prob=0; edges[numedges].prob2=0; edges[numedges].label=j; edges[numedges].type=0; edges[numedges].valid=1; edges[numedges].stop=1; edges[numedges].headlabel=0; clear_args(&edges[numedges].lc); clear_args(&edges[numedges].rc); edges[numedges].lc.nvs=0; edges[numedges].lc.adj=1; edges[numedges].rc.nvs=0; edges[numedges].rc.adj=1; edges[numedges].hasverb = isverb(j);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -