📄 extracttext.java

📁 ajax lucene 部分源代码 HTMLParser.java MuiltiSearchTest.java
💻 JAVA
字号:
package test;

import net.htmlparser.jericho.*;

import java.util.*;
import java.io.*;
import java.net.*;

public class ExtractText {
	public static void main(String[] args) throws Exception {

	}

	public String getConent() throws Exception {
		// String sourceUrlString="http://blog.s135.com/nginx_php_v5/";

		String sourceUrlString = "http://www.11467.com/";
		sourceUrlString = "http://www.baidu.com/s?lm=0&si=&rn=10&ie=gb2312&ct=0&wd=%BF%AA%B7%A2%D7%D4%BC%BA%B5%C4%CB%D1%CB%F7%D2%FD%C7%E6++%D4%B4%B4%FA%C2%EB&pn=0&ver=0&cl=3&uim=0&usm=0";
		sourceUrlString = "http://mall.sina.com.cn/product_1436088.htm";
		InputStream inputstream = new FileInputStream(
				"F:\\lucene\\test\\swtfaq.html");
		// InputStream
		// if (args.length==0)
		// System.err.println("Using default argument of
		// \""+sourceUrlString+'"');
		// else
		// sourceUrlString=args[0];
		// if (sourceUrlString.indexOf(':')==-1)
		// sourceUrlString="file:"+sourceUrlString;
		MicrosoftTagTypes.register();
		PHPTagTypes.register();
		PHPTagTypes.PHP_SHORT.deregister(); // remove PHP short tags for this
		// example otherwise they override
		// processing instructions
		MasonTagTypes.register();
		Source source = new Source(inputstream);// new URL(sourceUrlString));

		// Call fullSequentialParse manually as most of the source will be
		// parsed.
		source.fullSequentialParse();

		System.out.println("Document title:");
		String title = getTitle(source);
		System.out.println(title == null ? "(none)" : title);

		System.out.println("\nDocument description:");
		String description = getMetaValue(source, "description");
		System.out.println(description == null ? "(none)" : description);

		System.out.println("\nDocument keywords:");
		String keywords = getMetaValue(source, "keywords");
		System.out.println(keywords == null ? "(none)" : keywords);

		System.out.println("\nLinks to other documents:");
		List<Element> linkElements = source.getAllElements(HTMLElementName.A);
		for (Element linkElement : linkElements) {
			String href = linkElement.getAttributeValue("href");
			if (href == null)
				continue;
			// A element can contain other tags so need to extract the text from
			// it:
			String label = linkElement.getContent().getTextExtractor()
					.toString();
			System.out.println(label + " <" + href + '>');
		}

		System.out
				.println("\nAll text from file (exluding content inside SCRIPT and STYLE elements):\n");
		System.out.println(source.getTextExtractor().setIncludeAttributes(true)
				.toString());

		System.out
				.println("\nSame again but this time extend the TextExtractor class to also exclude text from P elements and any elements with class=\"control\":\n");
		TextExtractor textExtractor = new TextExtractor(source) {
			public boolean excludeElement(StartTag startTag) {
				return startTag.getName() == HTMLElementName.P
						|| "control".equalsIgnoreCase(startTag
								.getAttributeValue("class"));
			}
		};

		System.out.println(textExtractor.setIncludeAttributes(true).toString());
		return textExtractor.setIncludeAttributes(true).toString();
	}

	private static String getTitle(Source source) {
		Element titleElement = source.getFirstElement(HTMLElementName.TITLE);
		if (titleElement == null)
			return null;
		// TITLE element never contains other tags so just decode it collapsing
		// whitespace:
		return CharacterReference.decodeCollapseWhiteSpace(titleElement
				.getContent());
	}

	private static String getMetaValue(Source source, String key) {
		for (int pos = 0; pos < source.length();) {
			StartTag startTag = source.getNextStartTag(pos, "name", key, false);
			if (startTag == null)
				return null;
			if (startTag.getName() == HTMLElementName.META)
				return startTag.getAttributeValue("content"); // Attribute
			// values are
			// automatically
			// decoded
			pos = startTag.getEnd();
		}
		return null;
	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -