⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 grammar_ml.c

📁 SVMcfg: Learns a weighted context free grammar from examples. Training examples (e.g. for natural la
💻 C
字号:
/* grammar.c
 */

#include "grammar.h"
#include "tree.h"
#include "hash-templates.h"
#include "mmm.h"
#include <stdio.h>
#include <ctype.h>
#include <math.h>

#define MAX(x,y)	((x) < (y) ? (y) : (x))

HASH_CODE_ADD(sihashst, si_index, size_t, IDENTITY, NEQ, IDENTITY, NO_OP, 0, NO_OP)

/*
static brule
make_brule(const FLOAT prob, const si_index parent, const si_index left, const si_index right)
{
  brule br = MALLOC(sizeof(struct brule));
  br->prob = prob;
  br->parent = parent;
  br->left = left;
  br->right = right;
  return br;
}
*/

void
brules_free(brules brs)
{
  size_t i;
  for (i=0; i<brs.n; i++)
    FREE(brs.e[i]);
  FREE(brs.e);
}

static brules brules_empty = {NULL, 0, 0};

HASH_CODE(sihashbrs, si_index, brules, IDENTITY, NEQ, IDENTITY, NO_OP, brules_empty, brules_free)

typedef struct {
  si_index parent, left, right;
} brindex;

size_t brindex_hash(brindex bri)
{
  return (2*bri.parent+3*bri.left+5*bri.right);
}

int
brindex_neq(const brindex bri1, const brindex bri2)
{
  return ((bri1.parent != bri2.parent) || (bri1.left != bri2.left) || (bri1.right != bri2.right));
}

HASH_HEADER(brihashbr, brindex, brule)
HASH_CODE(brihashbr, brindex, brule, brindex_hash, brindex_neq, IDENTITY, NO_OP, NULL, NO_OP)

static void
add_brule(sihashbrs left_brules_ht, brihashbr brihtbr, 
	  const FLOAT prob, const si_index parent, const si_index left, const si_index right)
{
  brindex bri;
  brule br;
  brules *brsp;

  bri.parent = parent;
  bri.left = left;
  bri.right = right;
  br = brihashbr_ref(brihtbr, bri);

  if (br) {			/* have we seen this rule before? */
    br->prob += prob;		/* yes */
    return;
  }
  
  br = MALLOC(sizeof(struct brule));	/* make new brule */
  br->prob = prob;
  br->parent = parent;
  br->left = left;
  br->right = right;
  
  brsp = sihashbrs_valuep(left_brules_ht, left);
  
  if (brsp->nsize <= brsp->n) {
    brsp->nsize = MAX(2*brsp->nsize, 8);
    brsp->e = REALLOC(brsp->e, brsp->nsize * sizeof(brsp->e[0]));
  }

  assert(brsp->n < brsp->nsize);
  brsp->e[(brsp->n)++] = br;
  brihashbr_set(brihtbr, bri, br);
}


static urule
make_urule(const FLOAT prob, const si_index parent, const si_index child)
{
  urule ur = MALLOC(sizeof(struct urule));
  ur->prob = prob;
  ur->parent = parent;
  ur->child = child;
  return ur;
}

static void
urules_free(urules urs)
{
  size_t i;
  for (i=0; i<urs.n; i++)
    FREE(urs.e[i]);
  FREE(urs.e);
}

static urules urules_empty = {NULL, 0, 0};

HASH_CODE(sihashurs, si_index, urules, IDENTITY, NEQ, IDENTITY, NO_OP, urules_empty, urules_free)

static void
push_urule(sihashurs child_urules_ht, const si_index key, const urule ur)
{
  urules *ursp = sihashurs_valuep(child_urules_ht, key);
  
  if (ursp->nsize <= ursp->n) {
    ursp->nsize = MAX(2*ursp->nsize, 8);
    ursp->e = REALLOC(ursp->e, ursp->nsize * sizeof(ursp->e[0]));
  }

  assert(ursp->n < ursp->nsize);
  ursp->e[(ursp->n)++] = ur;
}

si_index 
read_cat(FILE *fp, si_t si)
{
  char string[MAXLABELLEN];
  int    c;
  size_t i;

  while ((c = fgetc(fp)) && isspace(c) && (c != '\n'))		/* skip spaces */
    ;

  if ((c == '\n') || (c == EOF)) return(0);			/* line ended, return 0 */

  for (i = 0; (c != EOF) && (!isspace(c)) && (i < MAXLABELLEN); c = fgetc(fp)) 
    string[i++] = c;

  ungetc(c, fp);

  if (i >= MAXLABELLEN) {
    string[MAXLABELLEN-1] = '\0';
    fprintf(stderr, "read_cat() in grammar.c: Category label longer than MAXLABELLEN: %s\n", string);
    exit(EXIT_FAILURE);
  }

  string[i] = '\0';
  return(si_string_index(si, string));
}
  
  
grammar
read_grammar(FILE *fp, si_t si) 
{
  sihashbrs left_brules_ht = make_sihashbrs(NLABELS);
  sihashurs child_urules_ht = make_sihashurs(NLABELS);
  sihashst parent_count_ht = make_sihashst(NLABELS);
  brihashbr brihtbr = make_brihashbr(NLABELS);
  int n, count;
  urule ur;
  sihashbrsit bhit;
  sihashursit uhit;
  size_t  lhs, cat, rhs[MAXRHS];

  while ((n = fscanf(fp, "%d ", &count)) == 1) {	/* read the count */
    lhs = read_cat(fp, si);
    assert(lhs);
    
    fscanf(fp, " " REWRITES);				/* read the rewrites symbol */

    for (n=0; n<MAXRHS; n++) {				/* read the rhs, n is length of rhs */
      cat = read_cat(fp, si);
      if (!cat)
	break;
      rhs[n] = cat;
    }

    if (n >= MAXRHS) {
      fprintf(stderr, "read_grammar() in grammar.c: rule rhs too long\n");
      exit(EXIT_FAILURE);
    }

    switch (n) {
    case 0: 
      fprintf(stderr, "read_grammar() in grammar.c: rule with empty rhs\n");
      exit(EXIT_FAILURE);
      break;
    case 1: 
      ur = make_urule(count, lhs, rhs[0]);
      push_urule(child_urules_ht, ur->child, ur);
      sihashst_inc(parent_count_ht, ur->parent, count);
      break;
    case 2:
      add_brule(left_brules_ht, brihtbr, count, lhs, rhs[0], rhs[1]);
      sihashst_inc(parent_count_ht, lhs, count);
      break;
    default: 
      { int start, i, j;
        char bcat[MAXBLABELLEN], *s;
	si_index bparent, left, right;

	right = rhs[n-1];		/* rightmost category */
	for (start=n-2; start>=1; start--) {
	  
	  i = 0;			/* i is index into bcat[] */
	  for (j=start; j<n; j++) {     /* j is index into rhs[] */
	    if (j!=start) {
	      bcat[i++] = BINSEP;
	      assert(i < MAXBLABELLEN);
	    }
	    
	    s = si_index_string(si, rhs[j]);
	    while (*s) {
	      bcat[i++] = *s++;
	      assert(i < MAXBLABELLEN);
	  }}

	  bcat[i] = '\0';
	  bparent = si_string_index(si, bcat);
	  left = rhs[start];
	  add_brule(left_brules_ht, brihtbr, count, bparent, left, right);
	  sihashst_inc(parent_count_ht, bparent, count);
	  right = bparent;
	}
	
	add_brule(left_brules_ht, brihtbr, count, lhs, rhs[0], right);
	sihashst_inc(parent_count_ht, lhs, count);
      }}}
  
  free_brihashbr(brihtbr);	/* free brindex hash table */

  { 
    int i; /* normalize grammar rules */

    for (bhit = sihashbrsit_init(left_brules_ht); sihashbrsit_ok(bhit); bhit = sihashbrsit_next(bhit))
      for (i=0; i<bhit.value.n; i++) 
	bhit.value.e[i]->prob = log(bhit.value.e[i]->prob / sihashst_ref(parent_count_ht, bhit.value.e[i]->parent));

    for (uhit = sihashursit_init(child_urules_ht); sihashursit_ok(uhit); uhit = sihashursit_next(uhit))
      for (i=0; i<uhit.value.n; i++) 
	uhit.value.e[i]->prob = log(uhit.value.e[i]->prob / sihashst_ref(parent_count_ht, uhit.value.e[i]->parent));
  }
  
  free_sihashst(parent_count_ht);
 
  {
    grammar g;
    g.urs = child_urules_ht;
    g.brs = left_brules_ht;
    return g;
  }
}

void 
write_grammar(FILE *fp, grammar g, si_t si) 
{
  sihashbrsit	bhit;
  sihashursit	uhit;
  size_t	i;

  for (bhit=sihashbrsit_init(g.brs); sihashbrsit_ok(bhit); bhit=sihashbrsit_next(bhit)) 
    for (i=0; i<bhit.value.n; i++) 
      fprintf(fp, "%g	%s " REWRITES " %s %s\n", (double) bhit.value.e[i]->prob, 
	      si_index_string(si, bhit.value.e[i]->parent),
	      si_index_string(si, bhit.value.e[i]->left),
	      si_index_string(si, bhit.value.e[i]->right));

  for (uhit=sihashursit_init(g.urs); sihashursit_ok(uhit); uhit=sihashursit_next(uhit)) 
    for (i=0; i<uhit.value.n; i++) 
      fprintf(fp, "%g	%s " REWRITES " %s\n", (double) uhit.value.e[i]->prob, 
	      si_index_string(si, uhit.value.e[i]->parent),
	      si_index_string(si, uhit.value.e[i]->child));
}

void
free_grammar(grammar g)
{
  free_sihashurs(g.urs);
  free_sihashbrs(g.brs);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -