📄 regexlinetagparser.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
字号:
package com.aliasi.corpus.parsers;import com.aliasi.corpus.StringParser;import com.aliasi.corpus.TagHandler;import java.util.ArrayList;import java.util.regex.Matcher;import java.util.regex.Pattern;/** * Provides a means of generating a tag parser based on a extracting * zone boundaries and token/tag pairs from lines of data using * regular expressions.  This provides a useful base implementation of * implementing the CoNLL text parsers, which zone inputs by sentence. * * <p>The parser is specified by means of three regular expressions. * If the ignore regular expression is matched, an input line is * ignored.  This is useful for ignoring empty lines and comments in * some inputs.  The eos regular expression recognizes lines that are * ends of sentences.  Whenever such a line is found, the zone * currently being processed is sent to the handler.  Finally, the * match regular expression is used to extract tags and tokens from * input lines, with the token index and tag index specifying the * subgroup matched in the regular expression. * * <p>Here is a worked example for the CoNLL 2002 data set, a subsequence * of which looks like: * * <blockquote><pre> * -DOCSTART- -DOCSTART- O * Met Prep O * tien Num O * miljoen Num O * komen V O * we Pron O * , Punc O * denk V O * ik Pron O * , Punc O * al Adv O * een Art O * heel Adj O * eind N O * . Punc O *  * Dirk N B-PER * ... * </pre></blockquote> * * And here's the regular expressions used to parse it: *  * <blockquote><pre> * String TOKEN_TAG_LINE_REGEX *     = "(\\S+)\\s(\\S+\\s)?(O|[B|I]-\\S+)"; // token ?posTag entityTag * * int TOKEN_GROUP = 1; // token * int TAG_GROUP = 3;   // entityTag * * String IGNORE_LINE_REGEX *     = "-DOCSTART(.*)";  // lines that start with "-DOCSTART" * * String EOS_REGEX *     = "\\A\\Z";         // empty/blank lines * * Parser parser  *     = new RegexLineTagParser(TOKEN_TAG_LINE_REGEX,  *                              TOKEN_GROUP, TAG_GROUP, *                              IGNORE_LINE_REGEX,  *                              EOS_REGEX); * </pre></blockquote> * * Lines starting with <code>&quot;-DOCSTART&quot;</code> are * ignored, blank lines end sentences; tokens and entity tags * are extracted by matching the regular expression and pulling * out match group 1 as the token and match group 3 as the tag. * An optional part-of-speech tag between the token and tag * on the line is ignored. * * @author  Bob Carpenter * @version 2.4.0 * @since   LingPipe2.4.0 */public class RegexLineTagParser extends StringParser {    private final Pattern mTokenTagPattern;    private final Pattern mIgnoreLinePattern;    private final Pattern mEosPattern;    private final int mTokenGroup;    private final int mTagGroup;    /**     * Construct a regular expression tag parser from the specified     * regular expressions and indexes. See the class documentation     * for further information.     *      * @param matchRegex Regular expression for matching tokens and tags.     * @param tokenGroup Index of group in regular expression for token.     * @param tagGroup Index of group in regular expression for tag.     * @param ignoreRegex Lines matching this regular expression are     * skipped.     * @param eosRegex Matches end of sentence for grouping handle     * events.     */    public RegexLineTagParser(String matchRegex, int tokenGroup, int tagGroup,                              String ignoreRegex,                              String eosRegex) {        this(null,matchRegex,tokenGroup,tagGroup,ignoreRegex,eosRegex);    }    /**     * Construct a regular expression tag parser from the specified     * regular expressions and indexes. See the class documentation     * for further information.     *      * @param handler Tag handler for this parser.     * @param matchRegex Regular expression for matching tokens and tags.     * @param tokenGroup Index of group in regular expression for token.     * @param tagGroup Index of group in regular expression for tag.     * @param ignoreRegex Lines matching this regular expression are     * skipped.     * @param eosRegex Matches end of sentence for grouping handle     * events.     */    public RegexLineTagParser(TagHandler handler,                               String matchRegex, int tokenGroup, int tagGroup,                               String ignoreRegex, String eosRegex) {        super(handler);        mTokenTagPattern = Pattern.compile(matchRegex);        mTokenGroup = tokenGroup;        mTagGroup = tagGroup;        mIgnoreLinePattern = Pattern.compile(ignoreRegex);        mEosPattern = Pattern.compile(eosRegex);    }                              public void parseString(char[] cs, int start, int end) {        String in = new String(cs,start,end-start);        String[] lines = in.split("\n");        ArrayList tokenList = new ArrayList();        ArrayList tagList = new ArrayList();        for (int i = 0; i < lines.length; ++i) {            Matcher lineIgnorer = mIgnoreLinePattern.matcher(lines[i]);            if (lineIgnorer.matches()) continue;            Matcher eosMatcher = mEosPattern.matcher(lines[i]);            if (eosMatcher.matches()) {                handle(tokenList,tagList);                continue;            }            Matcher matcher = mTokenTagPattern.matcher(lines[i]);            if (!matcher.matches()) {                String msg = "Illegal line=" + lines[i];                throw new IllegalArgumentException(msg);            }            String token = matcher.group(mTokenGroup);            String tag = matcher.group(mTagGroup);            tokenList.add(token);            tagList.add(tag);        }        handle(tokenList,tagList);    }    /**     * Returns the tag handler for this tag parser.  This     * is just a convenience cast of {@link #getHandler()}.     *      * @return Tag handler.     */    public TagHandler getTagHandler() {        return (TagHandler) getHandler();    }    private void handle(ArrayList tokenList, ArrayList tagList) {        int len = tokenList.size();        String[] tokens = (String[]) tokenList.toArray(new String[len]);        String[] tags = (String[]) tagList.toArray(new String[len]);        getTagHandler().handle(tokens,null,tags);        tokenList.clear();        tagList.clear();    }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -