📄 reuters21578parser.java
字号:
boolean includeTestDocuments) { mIncludeTrainingDocuments = includeTrainingDocuments; mIncludeTestDocuments = includeTestDocuments; mTopic = topic; if (!isAvailableTopic(mTopic)) { String msg = "Require known topic." + " Found topic=" + topic; throw new IllegalArgumentException(msg); } } /** * Implements the parser for character array slices. All other * parse methods eventually call this implementation. * * @param cs Underlying character array. * @param start Index of first character in the slice. * @param end Index of the last character in the slice plus 1. */ public void parseString(char[] cs, int start, int end) { String text = new String(cs,start,end-start); String[] lines = text.split("\n"); for (int i = 0; i < lines.length; ++i) { if (!lines[i].startsWith("<REUTERS")) continue; StringBuilder sb = new StringBuilder(); while (!lines[i].startsWith("</REUTERS")) { sb.append(lines[i++]); sb.append("\n"); } handleDocument(sb.toString()); } } void handleDocument(String text) { if (!hasTopics(text)) return; if (isTrainingDocument(text) && !mIncludeTrainingDocuments) return; if (isTestDocument(text) && !mIncludeTestDocuments) return; String topics = extract("TOPICS",text,true); String title = extract("TITLE",text,true); String dateline = extract("DATELINE",text,true); String body = extract("BODY",text,true); if (body.endsWith(END_BOILERPLATE_1) || body.endsWith(END_BOILERPLATE_2)) body = body.substring(0,body.length() - END_BOILERPLATE_1.length()); StringBuilder sb = new StringBuilder(); sb.append(title + "\n"); sb.append(dateline + "\n"); sb.append(body); boolean hasTopic = topics.indexOf(mTopic) >= 0; Classification classification = hasTopic ? ON_TOPIC : OFF_TOPIC; getHandler().handle(sb,classification); } static String extract(String elt, String text, boolean allowEmpty) { String startElt = "<" + elt + ">"; String endElt = "</" + elt + ">"; int startEltIndex = text.indexOf(startElt); if (startEltIndex < 0) { if (allowEmpty) return ""; throw new IllegalArgumentException("no start, elt=" + elt + " text=" + text); } int start = startEltIndex + startElt.length(); int end = text.indexOf(endElt,start); if (end < 0) throw new IllegalArgumentException("no end, elt=" + elt + " text=" + text); return text.substring(start,end); } static final Classification ON_TOPIC = new Classification(BinaryLMClassifier.DEFAULT_ACCEPT_CATEGORY); static final Classification OFF_TOPIC = new Classification(BinaryLMClassifier.DEFAULT_REJECT_CATEGORY); static final String END_BOILERPLATE_1 = "Reuter"; static final String END_BOILERPLATE_2 = "REUTER"; static final String[] TOPICS = { "acq", "alum", "austdlr", "barley", "bean", "belly", "bfr", "bop", "cake", "can", "carcass", "castor", "castorseed", "cattle", "chem", "citruspulp", "cocoa", "coconut", "coffee", "copper", "copra", "corn", "cornglutenfeed", "cotton", "cottonseed", "cpi", "cpu", "crude", "cruzado", "debt", "dfl", "dkr", "dlr", "dmk", "earn", "f", "feed", "fishmeal", "fuel", "fx", "gas", "gnp", "gold", "grain", "groundnut", "heat", "hk", "hog", "housing", "income", "instal", "interest", "inventories", "ipi", "iron", "jet", "jobs", "l", "lead", "lei", "lin", "linseed", "lit", "livestock", "lumber", "meal", "metal", "money", "naphtha", "nat", "nickel", "nkr", "nzdlr", "oat", "oil", "oilseed", "orange", "palladium", "palm", "palmkernel", "peseta", "pet", "platinum", "plywood", "pork", "potato", "propane", "rand", "rape", "rapeseed", "red", "reserves", "retail", "rice", "ringgit", "rubber", "rupiah", "rye", "saudriyal", "sfr", "ship", "silver", "skr", "sorghum", "soy", "soybean", "steel", "stg", "strategic", "sugar", "sun", "sunseed", "supply", "tapioca", "tea", "tin", "trade", "veg", "wheat", "wool", "wpi", "yen", "zinc" }; /** * Returns an array consisting of all of the available topics in * the Reuters collection. The complete list is shown in the * class javadoc above. * * <p>The list is a copy, so changing it has no effect on this class. * * @return The topics for the Reuters collection. */ public static String[] availableTopics() { String[] topics = new String[TOPICS.length]; for (int i = 0; i < topics.length; ++i) topics[i] = TOPICS[i]; return topics; } /** * Returns <code>true</code> if the specified topic is * available in the Reuters collection. * * @param topic Topic to test. * @return <code>true</code> if it available for classification. */ public static boolean isAvailableTopic(String topic) { for (String validTopic : TOPICS) if (validTopic.equals(topic)) return true; return false; } /** * Returns the corpus representation of the Reuters collection, * for the specified topic, reading the SGML files from the specified * directory. * * <p>The directory specified is read each time the methods of the * returned corpus are called. This streams the relevant parts of * the corpus as needed, which requires less memory, but more * time. It also requires the directory to stick around until needed. * * @param topic Topic for the corpus. * @param directory Directory in which to find the corpus files. * @return The corpus for the specified topic. * @throws IOException If there is an underlying I/O error reading * the corpus data. * @throws IllegalArgumentException If the topic is not available * in the Reuters collection. */ public static Corpus<ClassificationHandler<CharSequence,Classification>> corpus(String topic, File directory) throws IOException { return new ReutersCorpus(topic,directory); } static boolean hasTopics(String document) { return containsText(document,"TOPICS=\"Y"); } static boolean isTrainingDocument(String document) { return containsText(document,"LEWISSPLIT=\"TR"); } static boolean isTestDocument(String document) { return containsText(document,"LEWISSPLIT=\"TE"); } static boolean containsText(String doc, String text) { return doc.indexOf(text) >= 0; } private static class ReutersCorpus extends Corpus<ClassificationHandler<CharSequence,Classification>> { private final String mTopic; private final File mDirectory; ReutersCorpus(String topic, File directory) { mTopic = topic; mDirectory = directory; } public void visitCorpus(ClassificationHandler<CharSequence,Classification> handler) throws IOException { visit(handler,true,true); } public void visitTest(ClassificationHandler<CharSequence,Classification> handler) throws IOException { visit(handler,false,true); } public void visitTrain(ClassificationHandler<CharSequence,Classification> handler) throws IOException { visit(handler,true,false); } void visit(ClassificationHandler<CharSequence,Classification> handler, boolean includeTrain, boolean includeTest) throws IOException { Reuters21578Parser parser = new Reuters21578Parser(mTopic,includeTrain,includeTest); parser.setHandler(handler); for (File file : mDirectory.listFiles(new FileExtensionFilter(".sgm"))) parser.parse(file); } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -