testdocumentextraction.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 153 行

JAVA
153
字号
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package edu.umass.cs.mallet.base.extract.test;

import junit.framework.*;
import edu.umass.cs.mallet.base.types.LabelAlphabet;
import edu.umass.cs.mallet.base.types.LabelSequence;
import edu.umass.cs.mallet.base.types.Label;
import edu.umass.cs.mallet.base.extract.*;
import edu.umass.cs.mallet.base.util.CharSequenceLexer;

import java.util.regex.Pattern;

/**
 * Checks the XML produced by {@code DocumentExtraction.toXmlString()} for one
 * fixed sentence, using several ways of supplying the labeling: a plain label
 * sequence, a BIO tokenization filter, explicitly added nested spans, and a
 * hierarchical tokenization filter.
 *
 * Created: Oct 12, 2004
 *
 * @author <A HREF="mailto:casutton@cs.umass.edu">casutton@cs.umass.edu</A>
 * @version $Id: TestDocumentExtraction.java,v 1.4 2004/11/20 00:57:19 casutton Exp $
 */
public class TestDocumentExtraction extends TestCase {

  /** The nine-word sentence every test labels. */
  private static final String DOCUMENT = "the quick brown fox leapt over the lazy dog";

  /** First line of every expected XML string. */
  private static final String XML_DECLARATION = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";

  public TestDocumentExtraction (String name)
  {
    super (name);
  }

  /** Returns a suite holding every test method of this class. */
  public static Test suite ()
  {
    return new TestSuite (TestDocumentExtraction.class);
  }

  /** Tokenizes {@link #DOCUMENT} with a default-constructed lexer. */
  private static StringTokenization tokenizeDocument ()
  {
    return new StringTokenization (DOCUMENT, new CharSequenceLexer ());
  }

  /**
   * Asserts that the extraction serializes to the XML declaration followed by
   * the given {@code <doc>} line.
   */
  private static void assertXml (String expectedDocLine, DocumentExtraction extraction)
  {
    String produced = extraction.toXmlString ();
    assertEquals (XML_DECLARATION + expectedDocLine, produced);
  }

  /** Plain label sequence; "O" is passed as the background tag. */
  public void testToXml ()
  {
    LabelAlphabet alphabet = new LabelAlphabet ();
    StringTokenization tokens = tokenizeDocument ();

    Label outside = alphabet.lookupLabel ("O");
    Label animal = alphabet.lookupLabel ("ANIMAL");
    Label verb = alphabet.lookupLabel ("VERB");

    Label[] perToken = {
      outside, animal, animal, animal, verb, outside, outside, animal, animal
    };
    LabelSequence labeling = new LabelSequence (perToken);

    DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokens, labeling, "O");

    assertXml (
        "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n",
        extraction);
  }

  /**
   * Label sequence interpreted through a {@code BIOTokenizationFilter}.  The
   * expected XML has a separate ANIMAL element starting at "fox", the token
   * tagged "B-ANIMAL" in the middle of the animal run.
   */
  public void testToXmlBIO ()
  {
    LabelAlphabet alphabet = new LabelAlphabet ();
    StringTokenization tokens = tokenizeDocument ();

    Label outside = alphabet.lookupLabel ("O");
    Label beginAnimal = alphabet.lookupLabel ("B-ANIMAL");
    Label animal = alphabet.lookupLabel ("ANIMAL");
    Label beginVerb = alphabet.lookupLabel ("B-VERB");
    Label insideVerb = alphabet.lookupLabel ("I-VERB");

    Label[] perToken = {
      outside, beginAnimal, animal, beginAnimal, beginVerb, insideVerb, outside, animal, animal
    };
    LabelSequence labeling = new LabelSequence (perToken);

    DocumentExtraction extraction =
        new DocumentExtraction ("Test", alphabet, tokens, labeling, null, "O", new BIOTokenizationFilter ());

    assertXml (
        "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n",
        extraction);
  }

  /**
   * Spans built by a {@code DefaultTokenizationFilter}, plus two hand-added
   * single-token spans that the expected XML shows nested inside ANIMAL.
   */
  public void testNestedToXML ()
  {
    LabelAlphabet alphabet = new LabelAlphabet ();
    StringTokenization tokens = tokenizeDocument ();

    Label outside = alphabet.lookupLabel ("O");
    Label animal = alphabet.lookupLabel ("ANIMAL");
    Label verb = alphabet.lookupLabel ("VERB");
    Label adjective = alphabet.lookupLabel ("ADJ");
    Label mammal = alphabet.lookupLabel ("MAMMAL");

    Label[] perToken = {
      outside, animal, animal, animal, verb, outside, animal, animal, animal
    };
    LabelSequence labeling = new LabelSequence (perToken);

    LabeledSpans labeledSpans =
        new DefaultTokenizationFilter ().constructLabeledSpans (alphabet, DOCUMENT, outside, tokens, labeling);

    // Token 3 ("fox") gets an additional MAMMAL span.
    Span fox = tokens.subspan (3, 4);
    labeledSpans.add (new LabeledSpan (fox, mammal, false));

    // Token 7 ("lazy") gets an additional ADJ span.
    Span lazy = tokens.subspan (7, 8);
    labeledSpans.add (new LabeledSpan (lazy, adjective, false));

    DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokens, labeledSpans, null, "O");

    assertXml (
        "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy </ADJ>dog</ANIMAL></doc>\r\n",
        extraction);
  }

  /**
   * Labels containing '|' interpreted through a
   * {@code HierarchicalTokenizationFilter}: first with the no-argument
   * constructor, then with a filter constructed from the pattern "AD.*", for
   * which the expected XML contains no ADJ element.
   */
  public void testNestedXMLTokenizationFilter ()
  {
    LabelAlphabet alphabet = new LabelAlphabet ();
    StringTokenization tokens = tokenizeDocument ();

    Label outside = alphabet.lookupLabel ("O");
    Label animal = alphabet.lookupLabel ("ANIMAL");
    Label animalMammal = alphabet.lookupLabel ("ANIMAL|MAMMAL");
    Label verb = alphabet.lookupLabel ("VERB");
    Label animalAdj = alphabet.lookupLabel ("ANIMAL|ADJ");
    Label animalAdjMammal = alphabet.lookupLabel ("ANIMAL|ADJ|MAMMAL");

    Label[] perToken = {
      outside, animal, animal, animalMammal, verb, outside, animal, animalAdj, animalAdjMammal
    };
    LabelSequence labeling = new LabelSequence (perToken);

    DocumentExtraction nested =
        new DocumentExtraction ("Test", alphabet, tokens, labeling, null, "O",
                                new HierarchicalTokenizationFilter ());
    assertXml (
        "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy <MAMMAL>dog</MAMMAL></ADJ></ANIMAL></doc>\r\n",
        nested);

    // Test the ignore function
    DocumentExtraction withIgnore =
        new DocumentExtraction ("Test", alphabet, tokens, labeling, null, "O",
                                new HierarchicalTokenizationFilter (Pattern.compile ("AD.*")));
    assertXml (
        "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the lazy <MAMMAL>dog</MAMMAL></ANIMAL></doc>\r\n",
        withIgnore);
  }

  /**
   * Runs the tests with the JUnit text runner.  With arguments, each argument
   * is used as the name of a single test to run; with none, the full
   * {@link #suite()} is run.
   */
  public static void main (String[] args) throws Throwable
  {
    TestSuite toRun;
    if (args.length == 0) {
      toRun = (TestSuite) suite ();
    } else {
      toRun = new TestSuite ();
      for (int argIdx = 0; argIdx < args.length; argIdx++) {
        String testName = args[argIdx];
        toRun.addTest (new TestDocumentExtraction (testName));
      }
    }
    junit.textui.TestRunner.run (toRun);
  }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?