📄 regexp.java
字号:
/* * WebSPHINX web crawling toolkit * Copyright (C) 1998,1999 Carnegie Mellon University * * This library is free software; you can redistribute it * and/or modify it under the terms of the GNU Library * General Public License as published by the Free Software * Foundation, version 2. * * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/ */package websphinx;import java.util.Vector;import java.util.Enumeration;import java.io.IOException;//#ifdef JDK1.1import java.io.ObjectInputStream;//#endif JDK1.1public class Regexp extends Pattern { static com.oroinc.text.regex.PatternCompiler compiler = new com.oroinc.text.regex.Perl5Compiler (); String stringRep; transient com.oroinc.text.regex.Pattern pattern; transient String[] fields; public Regexp (String pattern) { stringRep = pattern; init (); } public boolean equals (Object object) { if (! (object instanceof Regexp)) return false; Regexp p = (Regexp)object; return p.stringRep.equals (stringRep); } //#ifdef JDK1.1 private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject (); init (); } //#endif JDK1.1 private void init () { synchronized (compiler) { try { this.pattern = compiler.compile (translateFields (stringRep)); } catch (com.oroinc.text.regex.MalformedPatternException e) { throw new RuntimeException ("syntax error in pattern: " + pattern); } } } public String[] getFieldNames () { return fields; } public String toString () { return stringRep; } public PatternMatcher match (Region region) { return new RegexpMatcher (this, region); } public static String escape (String s) { return websphinx.util.Str.escape (s, '\\', "\\^.$|()[]*+?{}"); } String translateFields (String s) { Vector vfields = new Vector (); boolean inEscape = false; StringBuffer output = new StringBuffer (); int len = s.length (); for (int i=0; i<len; ++i) { char c = s.charAt (i); if (inEscape) { output.append (c); inEscape = false; } else { switch (c) { case '\\': output.append (c); inEscape = true; break; case '(': output.append (c); if (s.startsWith ("?{", i+1)) { int start = i+3; int end = s.indexOf ('}', start); vfields.addElement (s.substring (start, end)); i = end; } else if (!s.startsWith ("?", i+1)) vfields.addElement (String.valueOf (vfields.size())); break; default: output.append (c); break; } } } fields = new String[vfields.size()]; vfields.copyInto (fields); return output.toString (); } public static void main (String[] args) throws Exception { if (args.length < 2) { System.err.println ("usage: Regexp <pattern> <source URL>*"); return; } Pattern p = new Regexp (args[0].replace ('_', ' ') ); for (int i=1; i<args.length; ++i) { Page page = new Page (new Link (args[i])); System.out.println ("--------------------" + args[i]); PatternMatcher m = p.match (page); for (Region r = m.nextMatch(); r != null; r = m.nextMatch()) { System.out.println ("[" + r.getStart() + "," + r.getEnd() + "]" + r); Enumeration enum = r.enumerateObjectLabels (); while (enum.hasMoreElements ()) { String lbl = (String)enum.nextElement (); Object object = r.getObjectLabel (lbl); if (object instanceof Region) { Region s = (Region)object; System.out.println (" "+lbl+"=[" + s.getStart() + "," + s.getEnd() + "]" + s); } } } } }}class RegexpMatcher extends PatternMatcher { com.oroinc.text.regex.PatternMatcher matcher = new com.oroinc.text.regex.Perl5Matcher (); Regexp regexp; Region source; com.oroinc.text.regex.PatternMatcherInput input; public RegexpMatcher (Regexp regexp, Region source) { this.regexp = regexp; this.source = source; this.input = new com.oroinc.text.regex.PatternMatcherInput (source.getSource().getContent(), source.getStart(), source.getLength ()); } protected Region findNext () { if (matcher.contains (input, regexp.pattern)) { com.oroinc.text.regex.MatchResult m = matcher.getMatch (); Page page = source.getSource (); Region match = new Region (page, m.beginOffset (0), m.endOffset (0)); int n = m.groups()-1; Region[] groups = new Region[n]; for (int i=0; i<n; ++i) { Region r = new Region (page, m.beginOffset (i+1), m.endOffset (i+1)); groups[i] = r; match.setField (regexp.fields[i], r); } match.setFields (Pattern.groups, groups); return match; } else return null; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -