⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regexp.java

📁 一个Web爬虫(机器人
💻 JAVA
字号:
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */

package websphinx;

import java.util.Vector;
import java.util.Enumeration;
import java.io.IOException;
//#ifdef JDK1.1
import java.io.ObjectInputStream;
//#endif JDK1.1

/**
 * A Pattern backed by a Perl5-style regular expression (compiled with the
 * ORO regex library).
 *
 * On top of the regular expression syntax accepted by the compiler, a
 * parenthesized group may be given a field name by writing
 * <code>(?{name}...)</code>.  Groups that open with a plain <code>(</code>
 * are named by their position ("0", "1", ...).  Groups that open with
 * <code>(?</code> followed by anything other than <code>{</code> receive
 * no field name.
 */
public class Regexp extends Pattern {

    // Shared by every Regexp instance; init() locks on it while compiling.
    static com.oroinc.text.regex.PatternCompiler compiler
            = new com.oroinc.text.regex.Perl5Compiler ();

    // The pattern text exactly as supplied by the caller.  This is the only
    // serialized state; the transient fields are rebuilt from it by init().
    String stringRep;
    transient com.oroinc.text.regex.Pattern pattern;
    transient String[] fields;

    /**
     * Compiles a regular expression.
     *
     * @param pattern regular expression text, optionally using the
     *        <code>(?{name}...)</code> field-naming syntax
     * @throws RuntimeException if the pattern cannot be compiled
     */
    public Regexp (String pattern) {
        stringRep = pattern;
        init ();
    }

    /**
     * Two Regexps are equal when their pattern text is equal.
     */
    public boolean equals (Object object) {
        if (! (object instanceof Regexp))
            return false;
        Regexp p = (Regexp)object;
        return p.stringRep.equals (stringRep);
    }

    /**
     * Hash code derived from the pattern text, so that it stays consistent
     * with equals().
     */
    public int hashCode () {
        return stringRep.hashCode ();
    }

    //#ifdef JDK1.1
    // Restores stringRep from the stream, then recompiles the transient state.
    private void readObject (ObjectInputStream in)
            throws IOException, ClassNotFoundException {
        in.defaultReadObject ();
        init ();
    }
    //#endif JDK1.1

    /**
     * Translates the field syntax in stringRep and compiles the result,
     * filling in both <code>pattern</code> and <code>fields</code>.
     *
     * @throws RuntimeException if the pattern cannot be compiled
     */
    private void init () {
        // NOTE(review): locking on the shared compiler presumably guards a
        // compiler that is not safe for concurrent use -- confirm.
        synchronized (compiler) {
            try {
                this.pattern = compiler.compile (translateFields (stringRep));
            } catch (com.oroinc.text.regex.MalformedPatternException e) {
                // Report the source text: this.pattern has not been assigned
                // yet when compilation fails, so it would print as "null".
                throw new RuntimeException ("syntax error in pattern: " + stringRep
                                            + " (" + e.getMessage () + ")");
            }
        }
    }

    /**
     * @return the field names, one per group counted by translateFields(),
     *         in the order the groups open
     */
    public String[] getFieldNames () {
        return fields;
    }

    /**
     * @return the pattern text originally passed to the constructor
     */
    public String toString () {
        return stringRep;
    }

    /**
     * Creates a matcher that applies this pattern to a region.
     *
     * @param region region to search
     * @return a new RegexpMatcher over that region
     */
    public PatternMatcher match (Region region) {
        return new RegexpMatcher (this, region);
    }

    /**
     * Escapes a string for literal use inside a regular expression by
     * delegating to websphinx.util.Str.escape with a backslash as the escape
     * character and <code>\^.$|()[]*+?{}</code> as the set of characters to
     * escape.
     *
     * @param s text to escape
     * @return the escaped text
     */
    public static String escape (String s) {
        return websphinx.util.Str.escape (s, '\\', "\\^.$|()[]*+?{}");
    }

    /**
     * Strips the <code>?{name}</code> field-naming syntax out of a pattern
     * and records the field name of every group in <code>fields</code>.
     *
     * A character following a backslash is copied through untouched.
     *
     * @param s pattern text in the extended syntax
     * @return the pattern text with every <code>?{name}</code> removed
     * @throws RuntimeException if a <code>(?{</code> has no closing
     *         <code>}</code>
     */
    String translateFields (String s) {
        Vector vfields = new Vector ();
        boolean inEscape = false;
        StringBuffer output = new StringBuffer ();

        int len = s.length ();
        for (int i=0; i<len; ++i) {
            char c = s.charAt (i);
            if (inEscape) {
                output.append (c);
                inEscape = false;
            }
            else {
                switch (c) {
                  case '\\':
                    output.append (c);
                    inEscape = true;
                    break;
                  case '(':
                    output.append (c);
                    if (s.startsWith ("?{", i+1)) {
                        // Named group: record the name and skip past the
                        // closing brace so "?{name}" is not emitted.
                        int start = i+3;
                        int end = s.indexOf ('}', start);
                        if (end == -1)
                            throw new RuntimeException ("syntax error in pattern: " + s
                                                        + " (missing '}' after \"(?{\")");
                        vfields.addElement (s.substring (start, end));
                        i = end;
                    }
                    else if (!s.startsWith ("?", i+1))
                        // Plain group: named by its position.
                        vfields.addElement (String.valueOf (vfields.size()));
                    break;
                  default:
                    output.append (c);
                    break;
                }
            }
        }

        fields = new String[vfields.size()];
        vfields.copyInto (fields);
        return output.toString ();
    }

    /**
     * Command-line driver: matches a pattern against one or more pages and
     * prints each match together with its Region-valued object labels.
     * Underscores in the pattern argument are replaced by spaces.
     *
     * Usage: Regexp &lt;pattern&gt; &lt;source URL&gt;*
     */
    public static void main (String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println ("usage: Regexp <pattern> <source URL>*");
            return;
        }

        Pattern p = new Regexp (args[0].replace ('_', ' ') );
        for (int i=1; i<args.length; ++i) {
            Page page = new Page (new Link (args[i]));
            System.out.println ("--------------------" + args[i]);
            PatternMatcher m = p.match (page);
            for (Region r = m.nextMatch(); r != null; r = m.nextMatch()) {
                System.out.println ("[" + r.getStart() + "," + r.getEnd() + "]" + r);

                // "enum" became a reserved word in Java 5, so the
                // enumeration must not be called that.
                Enumeration labels = r.enumerateObjectLabels ();
                while (labels.hasMoreElements ()) {
                    String lbl = (String)labels.nextElement ();
                    Object object = r.getObjectLabel (lbl);
                    if (object instanceof Region) {
                        Region s = (Region)object;
                        System.out.println ("    "+lbl+"=[" + s.getStart() + "," + s.getEnd() + "]" + s);
                    }
                }
            }
        }
    }
}

/**
 * Finds successive matches of a Regexp within a source Region.
 */
class RegexpMatcher extends PatternMatcher {
    com.oroinc.text.regex.PatternMatcher matcher = new com.oroinc.text.regex.Perl5Matcher ();
    Regexp regexp;
    Region source;
    com.oroinc.text.regex.PatternMatcherInput input;

    /**
     * @param regexp compiled pattern to apply
     * @param source region to search; the matcher input is built from the
     *        content of the region's source page together with the region's
     *        start and length
     */
    public RegexpMatcher (Regexp regexp, Region source) {
        this.regexp = regexp;
        this.source = source;
        this.input = new com.oroinc.text.regex.PatternMatcherInput (source.getSource().getContent(),
                                                                   source.getStart(), source.getLength ());
    }

    /**
     * Looks for the next match in the input.
     *
     * @return a Region built from the offsets of the whole match, with one
     *         field per group set under that group's field name and all
     *         group regions stored under Pattern.groups; null when there
     *         is no further match
     */
    protected Region findNext () {
        if (matcher.contains (input, regexp.pattern)) {
            com.oroinc.text.regex.MatchResult m = matcher.getMatch ();
            Page page = source.getSource ();

            Region match = new Region (page, m.beginOffset (0), m.endOffset (0));

            // Group 0 is the whole match, so the sub-groups are 1..n.
            int n = m.groups()-1;
            Region[] groups = new Region[n];
            for (int i=0; i<n; ++i) {
                Region r = new Region (page, m.beginOffset (i+1), m.endOffset (i+1));
                groups[i] = r;
                match.setField (regexp.fields[i], r);
            }
            match.setFields (Pattern.groups, groups);
            return match;
        }
        else
            return null;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -