📄 urlparser.java
字号:
/*
* @(#)URLParser.java 4/01/2005
*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.parsers;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
import org.jasen.util.DNSUtils;
/**
 * <p>
 * HTML parser callback that collects the URLs referenced by anchor
 * (<code>&lt;a href&gt;</code>) and image (<code>&lt;img src&gt;</code>) tags
 * of a piece of email content.
 * </p>
 * <p>
 * The rationale here is that two spam emails with different content may in fact
 * be referencing the same URL.
 * </p>
 * <p>
 * This also provides for future enhancements based on blocking of content
 * associated with black-listed domains.
 * </p>
 * <p>
 * For every URL found, the host part (after being passed through
 * {@link DNSUtils#getValidDomainOnly(String)}) and, when it contains at least
 * one dot, the user-info part are recorded as tokens. Each token is prepended
 * with the prefix given at construction time.
 * </p>
 * <p>
 * Instances accumulate tokens across successive <code>parse</code> calls and
 * hold mutable state without any synchronization.
 * </p>
 * @author Jason Polites
 */
public class URLParser extends ParserCallback {

    /** Default prefix prepended to the tokens found. */
    public static final String URL_PREFIX = "url|";

    /**
     * Words that typically introduce a URL. This class does not read the array
     * itself.
     * This array MUST be sorted to facilitate a binary search.
     */
    public static String[] URL_WORDS = {
        "ftp",
        "http",
        "https",
        "mailto",
        "www"
    };

    // Sort the relevant arrays
    static {
        Arrays.sort(URL_WORDS);
    }

    /** Scheme every candidate URL is normalised to before it is parsed. */
    private static final String HTTP_SCHEME = "http://";

    /** Separator between a scheme name and the authority part of a URL. */
    private static final String SCHEME_SEPARATOR = "://";

    /** Tokens collected so far; stays null until the first URL has been parsed. */
    private List urls = null;

    /** Cached array view of {@link #urls}; reset whenever a token is added. */
    private String[] urlArray = null;

    /** Text prepended to every recorded token. */
    private String prefix;

    /**
     * Creates a parser which prepends {@link #URL_PREFIX} to every token.
     */
    public URLParser() {
        this.prefix = URL_PREFIX;
    }

    /**
     * Creates a parser which prepends the given text to every token.
     * @param prefix The text prepended to each host / user token
     */
    public URLParser(String prefix) {
        this.prefix = prefix;
    }

    /**
     * Records the <code>href</code> target of anchor tags; all other start
     * tags are ignored.
     * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
     */
    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
        if (HTML.Tag.A.equals(t)) {
            getAnchorUrl(a);
        }
    }

    /**
     * Records the <code>src</code> target of image tags; all other simple
     * tags are ignored.
     * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleSimpleTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
     */
    public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
        if (HTML.Tag.IMG.equals(t)) {
            getImageUrl(a);
        }
    }

    /** Extracts the URL held by the <code>href</code> attribute, if any. */
    private void getAnchorUrl(MutableAttributeSet a) {
        getAttributeUrl(a, HTML.Attribute.HREF);
    }

    /** Extracts the URL held by the <code>src</code> attribute, if any. */
    private void getImageUrl(MutableAttributeSet a) {
        getAttributeUrl(a, HTML.Attribute.SRC);
    }

    /**
     * Extracts the URL held by the given attribute when the attribute is
     * defined on the tag itself.
     * @param a The attributes of the tag being handled
     * @param attr The attribute expected to hold a URL
     */
    private void getAttributeUrl(MutableAttributeSet a, HTML.Attribute attr) {
        if (a != null && a.isDefined(attr)) {
            getUrl(a, attr);
        }
    }

    /**
     * Parses the value of the given attribute as a URL and records its host
     * and user-info tokens. Values which cannot be parsed as a URL are
     * silently skipped.
     * @param a The attributes of the tag being handled
     * @param key The attribute holding the URL
     */
    private void getUrl(MutableAttributeSet a, HTML.Attribute key) {
        Object value = a.getAttribute(key);

        if (value == null) {
            return;
        }

        try {
            // A fixed locale keeps the lower casing of ASCII host names
            // independent of the platform default (e.g. Turkish dotless i).
            String strUrl = toHttpUrl(value.toString().toLowerCase(Locale.ENGLISH));

            URL url = new URL(strUrl);
            String host = clean(url.getHost());
            String user = clean(url.getUserInfo());

            if (host != null || user != null) {
                if (urls == null) {
                    urls = new LinkedList();
                }

                if (host != null) {
                    host = DNSUtils.getValidDomainOnly(host);
                }

                // A user-info part is only kept when it consists of at least
                // two dot separated pieces; presumably this targets user-info
                // dressed up as a domain name (www.bank.com@evil.example).
                if (user != null && user.split("\\.").length <= 1) {
                    user = null;
                }

                addToken(host);
                addToken(user);
            }
        }
        catch (MalformedURLException e) {
            // Ignore the malformed url..
        }
    }

    /**
     * Rewrites a raw attribute value into an <code>http://</code> URL so that
     * host and user-info can be extracted uniformly:
     * <ul>
     * <li>values already starting with <code>http://</code> are kept;</li>
     * <li>protocol relative values (<code>//host/..</code>) get the
     * <code>http:</code> scheme;</li>
     * <li>values with another explicit <code>scheme://</code> (https, ftp, ..)
     * have that scheme replaced instead of getting a second scheme stacked in
     * front, which would make the scheme name show up as the host;</li>
     * <li>everything else (including <code>mailto:</code> values) is prefixed
     * with <code>http://</code>.</li>
     * </ul>
     * @param raw The lower cased attribute value, not null
     * @return The value in <code>http://</code> form
     */
    private static String toHttpUrl(String raw) {
        String spec = raw.trim();

        if (spec.startsWith(HTTP_SCHEME)) {
            return spec;
        }

        if (spec.startsWith("//")) {
            return "http:" + spec;
        }

        int separator = spec.indexOf(SCHEME_SEPARATOR);

        if (separator > 0 && isSchemeName(spec.substring(0, separator))) {
            return HTTP_SCHEME + spec.substring(separator + SCHEME_SEPARATOR.length());
        }

        return HTTP_SCHEME + spec;
    }

    /**
     * Tests whether the given text is shaped like a URL scheme name: an ASCII
     * letter followed by ASCII letters, digits, '+', '-' or '.'.
     * @param name The candidate scheme name, not null
     * @return true if the text has the shape of a scheme name
     */
    private static boolean isSchemeName(String name) {
        if (name.length() == 0 || !isAsciiLetter(name.charAt(0))) {
            return false;
        }

        for (int i = 1; i < name.length(); i++) {
            char chr = name.charAt(i);
            boolean allowed = isAsciiLetter(chr)
                    || (chr >= '0' && chr <= '9')
                    || chr == '+' || chr == '-' || chr == '.';

            if (!allowed) {
                return false;
            }
        }

        return true;
    }

    /** Tests whether the character is one of a-z or A-Z. */
    private static boolean isAsciiLetter(char chr) {
        return (chr >= 'a' && chr <= 'z') || (chr >= 'A' && chr <= 'Z');
    }

    /**
     * Records the prefixed token unless it is null or blank, and drops the
     * cached array so that {@link #getUrlArray()} reflects the addition.
     * @param token The host or user token, may be null
     */
    private void addToken(String token) {
        if (token != null && token.trim().length() > 0) {
            urls.add(prefix(token));
            urlArray = null;
        }
    }

    /**
     * Parses the given HTML text, recording the URLs found.
     * @param str The content to parse
     * @throws IOException If the content cannot be read
     */
    public void parse(String str) throws IOException {
        parse(new StringReader(str));
    }

    /**
     * Parses HTML read from the given stream, recording the URLs found.
     * The bytes are decoded with the platform default character set.
     * @param in The stream to read the content from
     * @throws IOException If the content cannot be read
     */
    public void parse(InputStream in) throws IOException {
        parse(new InputStreamReader(in));
    }

    /**
     * Parses HTML read from the given reader, recording the URLs found.
     * @param in The reader to read the content from
     * @throws IOException If the content cannot be read
     */
    public void parse(Reader in) throws IOException {
        ParserDelegator delegator = new ParserDelegator();
        delegator.parse(in, this, true);
    }

    /**
     * Removes every character outside the range 32 to 127 (inclusive), that is
     * control characters below the space character and all non ASCII
     * characters.
     * @param str The text to clean, may be null
     * @return The cleaned text, or null if the given text was null
     */
    private String clean(String str) {
        if (str == null) {
            return null;
        }

        StringBuffer buffer = new StringBuffer(str.length());

        for (int i = 0; i < str.length(); i++) {
            char chr = str.charAt(i);

            if (chr >= 32 && chr <= 127) {
                buffer.append(chr);
            }
        }

        return buffer.toString();
    }

    /**
     * Prepends the configured prefix to the given token.
     * @param str The token
     * @return The prefixed token
     */
    private String prefix(String str) {
        return prefix + str;
    }

    /**
     * Returns the contents of the parser as an array of String objects.
     * The array is cached until further tokens are recorded.
     * @return The tokens recorded so far, or null if no URL has been parsed yet
     */
    public String[] getUrlArray() {
        if (urlArray == null && urls != null) {
            urlArray = (String[]) urls.toArray(new String[urls.size()]);
        }

        return urlArray;
    }

    /**
     * Returns the recorded tokens as a list of Strings. The list returned is
     * the internal list of this parser, not a copy.
     * @return The tokens recorded so far, or null if no URL has been parsed yet
     */
    public List getUrls() {
        return urls;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -