📄 urlparser.java

📁 spam source codejasen-0.9jASEN - java Anti Spam ENgine.zip 如标题所示
💻 JAVA
字号:
/*
 * @(#)URLParser.java	4/01/2005
 * 
 * Copyright (c) 2004, 2005  jASEN.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the distribution.
 *
 *   3. The names of the authors may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *   4. Any modification or additions to the software must be contributed back
 *      to the project.
 *
 *   5. Any investigation or reverse engineering of source code or binary to
 *      enable emails to bypass the filters, and hence inflict spam and or viruses
 *      onto users who use or do not use jASEN could subject the perpetrator to
 *      criminal and or civil liability.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
 * OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
package org.jasen.core.parsers;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

import org.jasen.util.DNSUtils;

/**
 *
 * <P>
 *  Looks specifically for URL sequences in email content, both text and HTML.
 * </P>
 * <p>
 * 	The rationale here is that two spam emails with different content may in fact be referencing the same url.
 * </p>
 * <p>
 * 	This also provides for future enhancements based on blocking of content associated with black-listed domains
 * </p>
 * @author Jason Polites
 */
public class URLParser extends ParserCallback {

	/** Tokens (prefixed host / user-info strings) collected so far; null until a link has been parsed. */
	private List urls = null;

	/** Array snapshot of {@link #urls}; built lazily by {@link #getUrlArray()} and dropped when a token is added. */
	private String[] urlArray = null;

	/** Text prepended to every token recorded by this parser. */
	private String prefix;

	public static final String URL_PREFIX = "url|"; // Default prefix prepended to urls found

	/**
	 * This array MUST be sorted to facilitate a binary search
	 */
	public static String[] URL_WORDS = {
		"ftp",
		"http",
		"https",
		"mailto",
		"www"
	};


	// Sort the relevant arrays
	static {
		Arrays.sort(URL_WORDS);
	}


	/**
	 * Creates a parser that prepends {@link #URL_PREFIX} to every token it records.
	 */
	public URLParser() {
		this.prefix = URL_PREFIX;
	}

	/**
	 * Creates a parser that prepends the given text to every token it records.
	 * @param prefix The text to prepend
	 */
	public URLParser(String prefix) {
		this.prefix = prefix;
	}

	/**
	 * Parser callback for opening tags. Only anchor tags are of interest; their
	 * HREF attribute is examined.
	 * @param t The tag encountered
	 * @param a The attributes of the tag
	 * @param pos The position reported by the parser (unused)
	 */
	public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
		if(HTML.Tag.A.equals(t)) {
			getAttributeUrl(a, HTML.Attribute.HREF);
		}
	}


	/**
	 * Parser callback for simple (body-less) tags. Only image tags are of interest;
	 * their SRC attribute is examined.
	 * @param t The tag encountered
	 * @param a The attributes of the tag
	 * @param pos The position reported by the parser (unused)
	 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleSimpleTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
	 */
	public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
		if(HTML.Tag.IMG.equals(t)) {
			getAttributeUrl(a, HTML.Attribute.SRC);
		}
	}


	/**
	 * Processes the given attribute if (and only if) it is defined directly on the tag.
	 * @param a The attributes of the tag
	 * @param attr The attribute holding the link
	 */
	private void getAttributeUrl(MutableAttributeSet a, HTML.Attribute attr) {
		if(a.isDefined(attr)) {
			getUrl(a, attr);
		}
	}

	/**
	 * Reads the link held in the given attribute and records its host and,
	 * when it looks like a domain name, its user-info part. Links that cannot be parsed
	 * as a URL are silently skipped.
	 * @param a The attributes of the tag
	 * @param key The attribute holding the link
	 */
	private void getUrl(MutableAttributeSet a, HTML.Attribute key) {

		Object value = a.getAttribute(key);

		if(value == null) {
			return;
		}

		try {

			URL url = new URL(withScheme(value.toString()));

			String host = clean(url.getHost());
			String user = clean(url.getUserInfo());

			if(host != null || user != null) {
				if(urls == null) urls = new LinkedList();

				if(host != null) {
					host = DNSUtils.getValidDomainOnly(host);
				}

				// A user-info part without a dot cannot be a domain name, so it is of no interest
				if(user != null && user.split("\\.").length <= 1) {
					user = null;
				}

				addUrl(host);
				addUrl(user);
			}
		}
		catch (MalformedURLException e) {
			// Ignore the malformed url..
		}

	}

	/**
	 * Lower-cases the link and makes sure it starts with a scheme java.net.URL can parse.
	 * Links that already start with http://, https:// or ftp:// are left alone,
	 * protocol-relative links (//host/path) are treated as http, and anything else
	 * (www.example.com, mailto:user@example.com, ...) gets http:// prepended so that
	 * the host can still be extracted.
	 * @param raw The attribute value as found in the document
	 * @return The link in a form suitable for the URL constructor
	 */
	private static String withScheme(String raw) {
		// Fixed locale: the default locale may map ASCII letters to non-ASCII ones (e.g. Turkish)
		String spec = raw.trim().toLowerCase(Locale.ENGLISH);

		if(spec.startsWith("http://") || spec.startsWith("https://") || spec.startsWith("ftp://")) {
			return spec;
		}

		if(spec.startsWith("//")) {
			return "http:" + spec;
		}

		return "http://" + spec;
	}

	/**
	 * Records the token (with the prefix prepended) unless it is null or blank.
	 * Expects the urls list to have been created by the caller.
	 * @param token The host or user-info string to record
	 */
	private void addUrl(String token) {
		if(token != null && token.trim().length() > 0) {
			urls.add(prefix(token));

			// The cached array no longer reflects the list
			urlArray = null;
		}
	}

	/**
	 * Parses the given HTML text, collecting the links it contains.
	 * @param str The HTML text
	 * @throws IOException If the underlying parser fails to read the content
	 */
	public void parse(String str) throws IOException {
		parse(new StringReader(str));
	}

	/**
	 * Parses HTML read from the given stream, collecting the links it contains.
	 * The bytes are decoded with the platform default charset. The stream is not closed.
	 * @param in The stream to read from
	 * @throws IOException If the underlying parser fails to read the content
	 */
	public void parse(InputStream in) throws IOException {
		parse(new InputStreamReader(in));
	}

	/**
	 * Parses HTML read from the given reader, collecting the links it contains.
	 * The reader is not closed.
	 * @param in The reader to read from
	 * @throws IOException If the underlying parser fails to read the content
	 */
	public void parse(Reader in) throws IOException {
		ParserDelegator delegator = new ParserDelegator();
		delegator.parse(in, this, true);
	}

	/**
	 * Removes every character outside the range 32..127 (control and non ascii chars)
	 * @param str The text to filter, may be null
	 * @return The filtered text, or null if str was null
	 */
	private String clean(String str) {
		if(str == null) {
			return null;
		}

		StringBuffer buffer = new StringBuffer(str.length());

		for (int i = 0; i < str.length(); i++) {
			char chr = str.charAt(i);

			if(chr >= 32 && chr <= 127) {
				buffer.append(chr);
			}
		}

		return buffer.toString();
	}

	/**
	 * Prepends this parser's prefix to the given text.
	 * @param str The text to prefix
	 * @return The prefixed text
	 */
	private String prefix(String str) {
		return prefix + str;
	}

	/**
	 * Returns the contents of the parser as an array of String objects
	 * @return The tokens collected so far, or null if no link has been parsed yet
	 */
	public String[] getUrlArray() {
		if(urlArray == null && urls != null) {
			urlArray = (String[])urls.toArray(new String[urls.size()]);
		}
		return urlArray;
	}

	/**
	 * Returns the list of collected tokens as Strings
	 * @return The live internal list, or null if no link has been parsed yet
	 */
	public List getUrls() {
		return urls;
	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -