📄 reuters21578parser.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
                              boolean includeTestDocuments) {        mIncludeTrainingDocuments = includeTrainingDocuments;        mIncludeTestDocuments = includeTestDocuments;        mTopic = topic;        if (!isAvailableTopic(mTopic)) {            String msg = "Require known topic."                + " Found topic=" + topic;            throw new IllegalArgumentException(msg);        }    }    /**     * Implements the parser for character array slices.  All other     * parse methods eventually call this implementation.     *     * @param cs Underlying character array.     * @param start Index of first character in the slice.     * @param end Index of the last character in the slice plus 1.     */    public void parseString(char[] cs, int start, int end) {        String text = new String(cs,start,end-start);        String[] lines = text.split("\n");        for (int i = 0; i < lines.length; ++i) {            if (!lines[i].startsWith("<REUTERS")) continue;            StringBuilder sb = new StringBuilder();            while (!lines[i].startsWith("</REUTERS")) {                sb.append(lines[i++]);                sb.append("\n");            }            handleDocument(sb.toString());        }    }    void handleDocument(String text) {        if (!hasTopics(text)) return;        if (isTrainingDocument(text) && !mIncludeTrainingDocuments) return;        if (isTestDocument(text) && !mIncludeTestDocuments) return;        String topics = extract("TOPICS",text,true);        String title = extract("TITLE",text,true);        String dateline = extract("DATELINE",text,true);        String body = extract("BODY",text,true);        if (body.endsWith(END_BOILERPLATE_1) || body.endsWith(END_BOILERPLATE_2))            body = body.substring(0,body.length() - END_BOILERPLATE_1.length());        StringBuilder sb = new StringBuilder();        sb.append(title + "\n");        sb.append(dateline + "\n");        sb.append(body);        boolean hasTopic = topics.indexOf(mTopic) >= 0;        Classification classification = hasTopic ? ON_TOPIC : OFF_TOPIC;        getHandler().handle(sb,classification);    }    static String extract(String elt, String text, boolean allowEmpty) {        String startElt = "<" + elt + ">";        String endElt = "</" + elt + ">";        int startEltIndex = text.indexOf(startElt);        if (startEltIndex < 0) {            if (allowEmpty) return "";            throw new IllegalArgumentException("no start, elt=" + elt + " text=" + text);        }        int start = startEltIndex + startElt.length();        int end = text.indexOf(endElt,start);        if (end < 0) throw new IllegalArgumentException("no end, elt=" + elt + " text=" + text);        return text.substring(start,end);    }    static final Classification ON_TOPIC        = new Classification(BinaryLMClassifier.DEFAULT_ACCEPT_CATEGORY);    static final Classification OFF_TOPIC        = new Classification(BinaryLMClassifier.DEFAULT_REJECT_CATEGORY);    static final String END_BOILERPLATE_1 = "Reuter&#3;";    static final String END_BOILERPLATE_2 = "REUTER&#3;";    static final String[] TOPICS = {        "acq",        "alum",        "austdlr",        "barley",        "bean",        "belly",        "bfr",        "bop",        "cake",        "can",        "carcass",        "castor",        "castorseed",        "cattle",        "chem",        "citruspulp",        "cocoa",        "coconut",        "coffee",        "copper",        "copra",        "corn",        "cornglutenfeed",        "cotton",        "cottonseed",        "cpi",        "cpu",        "crude",        "cruzado",        "debt",        "dfl",        "dkr",        "dlr",        "dmk",        "earn",        "f",        "feed",        "fishmeal",        "fuel",        "fx",        "gas",        "gnp",        "gold",        "grain",        "groundnut",        "heat",        "hk",        "hog",        "housing",        "income",        "instal",        "interest",        "inventories",        "ipi",        "iron",        "jet",        "jobs",        "l",        "lead",        "lei",        "lin",        "linseed",        "lit",        "livestock",        "lumber",        "meal",        "metal",        "money",        "naphtha",        "nat",        "nickel",        "nkr",        "nzdlr",        "oat",        "oil",        "oilseed",        "orange",        "palladium",        "palm",        "palmkernel",        "peseta",        "pet",        "platinum",        "plywood",        "pork",        "potato",        "propane",        "rand",        "rape",        "rapeseed",        "red",        "reserves",        "retail",        "rice",        "ringgit",        "rubber",        "rupiah",        "rye",        "saudriyal",        "sfr",        "ship",        "silver",        "skr",        "sorghum",        "soy",        "soybean",        "steel",        "stg",        "strategic",        "sugar",        "sun",        "sunseed",        "supply",        "tapioca",        "tea",        "tin",        "trade",        "veg",        "wheat",        "wool",        "wpi",        "yen",        "zinc"    };    /**     * Returns an array consisting of all of the available topics in     * the Reuters collection.  The complete list is shown in the     * class javadoc above.     *     * <p>The list is a copy, so changing it has no effect on this class.     *     * @return The topics for the Reuters collection.     */    public static String[] availableTopics() {        String[] topics = new String[TOPICS.length];        for (int i = 0; i < topics.length; ++i)            topics[i] = TOPICS[i];        return topics;    }    /**     * Returns <code>true</code> if the specified topic is     * available in the Reuters collection.     *     * @param topic Topic to test.     * @return <code>true</code> if it available for classification.     */    public static boolean isAvailableTopic(String topic) {        for (String validTopic : TOPICS)            if (validTopic.equals(topic))                return true;        return false;    }    /**     * Returns the corpus representation of the Reuters collection,     * for the specified topic, reading the SGML files from the specified     * directory.     *     * <p>The directory specified is read each time the methods of the     * returned corpus are called.  This streams the relevant parts of     * the corpus as needed, which requires less memory, but more     * time.  It also requires the directory to stick around until needed.     *     * @param topic Topic for the corpus.     * @param directory Directory in which to find the corpus files.     * @return The corpus for the specified topic.     * @throws IOException If there is an underlying I/O error reading     * the corpus data.     * @throws IllegalArgumentException If the topic is not available     * in the Reuters collection.     */    public static Corpus<ClassificationHandler<CharSequence,Classification>>        corpus(String topic, File directory) throws IOException {        return new ReutersCorpus(topic,directory);    }    static boolean hasTopics(String document) {        return containsText(document,"TOPICS=\"Y");    }    static boolean isTrainingDocument(String document) {        return containsText(document,"LEWISSPLIT=\"TR");    }    static boolean isTestDocument(String document) {        return containsText(document,"LEWISSPLIT=\"TE");    }    static boolean containsText(String doc, String text) {        return doc.indexOf(text) >= 0;    }    private static class ReutersCorpus        extends Corpus<ClassificationHandler<CharSequence,Classification>> {        private final String mTopic;        private final File mDirectory;        ReutersCorpus(String topic, File directory) {            mTopic = topic;            mDirectory = directory;        }        public void visitCorpus(ClassificationHandler<CharSequence,Classification> handler)            throws IOException {            visit(handler,true,true);        }        public void visitTest(ClassificationHandler<CharSequence,Classification> handler)            throws IOException {            visit(handler,false,true);        }        public void visitTrain(ClassificationHandler<CharSequence,Classification> handler)            throws IOException {            visit(handler,true,false);        }        void visit(ClassificationHandler<CharSequence,Classification> handler,                   boolean includeTrain, boolean includeTest)            throws IOException {            Reuters21578Parser parser = new Reuters21578Parser(mTopic,includeTrain,includeTest);            parser.setHandler(handler);            for (File file : mDirectory.listFiles(new FileExtensionFilter(".sgm")))                parser.parse(file);        }    }}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -