📄 extractcontext.java

📁 网页是组成互联网的基本数据单元
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package com.newwatch.tools;

import com.newwatch.toolkit.dal.dataobject.ChannelLinkDO;
import com.newwatch.toolkit.util.StringUtil;

import org.htmlparser.Node;
import org.htmlparser.Parser;

import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;

import org.htmlparser.tags.Div;
import org.htmlparser.tags.Html;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ParagraphTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableHeader;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;

import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;

import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class ExtractContext {
    /** Platform line terminator, read from the {@code line.separator} system property. */
    protected static final String lineSign = System.getProperty(
            "line.separator");
    /** Number of characters in {@link #lineSign}. */
    protected static final int lineSign_size = lineSign.length();

    /**
     * Application context for the system, created when this class is
     * initialized from three classpath XML configuration files
     * (persistence, biz-util and biz-dao).
     */
    public static final ApplicationContext context = new ClassPathXmlApplicationContext(new String[] {
                "newwatch/persistence.xml", "newwatch/biz-util.xml",
                "newwatch/biz-dao.xml"
            });

    /**
     * Manual smoke-test entry point: builds one hard-coded sample link
     * (gb2312-encoded page, link text "test") and runs the extraction
     * pipeline on it through an {@code ExtractContextConsole}.
     *
     * @param args command-line arguments; not read
     */
    public static void main(String[] args) {
        final ExtractContextConsole extractor = new ExtractContextConsole();

        // Sample page used for the manual run.
        final ChannelLinkDO sampleLink = new ChannelLinkDO();
        sampleLink.setEncode("gb2312");
        sampleLink.setLink("http://www.qiche.com.cn/files/200712/12016.shtml");
        sampleLink.setLinktext("test");

        extractor.makeContext(sampleLink);
    }

    /**
     * Downloads the page a link points to, extracts its content and renders
     * it into a static HTML file using a line-based template.
     *
     * <p>The page at {@code c.getLink()} is parsed with the encoding
     * {@code c.getEncode()}. For every top-level {@code Html} node the content
     * is collected through {@link #extractHtml}, then the template
     * {@code D:/kuaiso/site/templeate/context.vm} is copied line by line to
     * {@code D:/kuaiso/site/test/test.htm} (both read/written as GBK). Template
     * lines containing one of the markers below are replaced as a whole;
     * all other lines are copied unchanged:
     * <ul>
     *   <li>{@code #context} - a link back to the original page, the snapshot
     *       line and the extracted content</li>
     *   <li>{@code #titledesc} - a list item linking to {@code test.htm},
     *       labelled with the link text</li>
     *   <li>{@code #metatitle} - a {@code TITLE} element holding the link text</li>
     *   <li>{@code #metadesc} - a description {@code META} element holding the
     *       link text</li>
     * </ul>
     *
     * <p>Any exception raised while parsing or writing is caught and printed
     * to standard output; it is not propagated to the caller. The template
     * reader and the output writer are always closed, including on failure.
     *
     * @param c the link to process; its link, encoding and link text are read
     */
    public void makeContext(ChannelLinkDO c) {
        String metatitle = "<TITLE>{0}</TITLE>";
        String metadesc = "<META content={0} name=description>";
        // NOTE(review): the "{0}" time placeholder below is never substituted,
        // so it is written to the output verbatim. Left unchanged because the
        // intended time source/format is not visible here - confirm and fill in.
        String netshap = "<p> 正文快照: 时间{0}</p> ";

        String tempLeate = "<LI class=active><A href=\"{0}\" target=_blank>{1}</A></LI>";
        String crop = "<p><A href=\"{0}\" target=_blank>{1}</A></p> ";

        try {
            String siteUrl = getLinkUrl(c.getLink());
            Parser parser = new Parser(c.getLink());
            parser.setEncoding(c.getEncode());

            for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
                Node node = (Node) e.nextNode();

                if (!(node instanceof Html)) {
                    continue;
                }

                // Named pageContext so it does not shadow the static "context" field.
                PageContext pageContext = new PageContext();
                pageContext.setNumber(0);
                pageContext.setTextBuffer(new StringBuffer());
                // Collect the page content.
                extractHtml(node, pageContext, siteUrl);

                String lineContext = pageContext.getTextBuffer().toString();

                String srcfilePath = "D:/kuaiso/site/templeate/context.vm";
                String destfilePath = "D:/kuaiso/site/test/test.htm";
                BufferedReader reader = null;
                BufferedWriter writer = null;

                try {
                    reader = new BufferedReader(new InputStreamReader(
                                new FileInputStream(srcfilePath), "gbk"));
                    writer = new BufferedWriter(new OutputStreamWriter(
                                new FileOutputStream(destfilePath), "gbk"));

                    String line;

                    while ((line = reader.readLine()) != null) {
                        if (line.indexOf("#context") >= 0) {
                            String tempCrop = StringUtil.replace(crop, "{0}",
                                    c.getLink());
                            tempCrop = StringUtil.replace(tempCrop, "{1}",
                                    "      原文链接： " + c.getLink());

                            writer.write(tempCrop + lineSign);
                            writer.write(netshap + lineSign);
                            writer.write(lineContext + lineSign);

                            continue;
                        }

                        if (line.indexOf("#titledesc") >= 0) {
                            String tempLine = StringUtil.replace(tempLeate,
                                    "{0}", "test.htm");
                            tempLine = StringUtil.replace(tempLine, "{1}",
                                    "标题:  " + c.getLinktext());

                            writer.write(tempLine + lineSign);

                            continue;
                        }

                        if (line.indexOf("#metatitle") >= 0) {
                            // Fill a copy so the template itself stays intact
                            // for later marker lines.
                            String titleLine = StringUtil.replace(metatitle,
                                    "{0}", c.getLinktext());

                            writer.write(titleLine + lineSign);

                            continue;
                        }

                        if (line.indexOf("#metadesc") >= 0) {
                            String descLine = StringUtil.replace(metadesc,
                                    "{0}", c.getLinktext());

                            writer.write(descLine + lineSign);

                            continue;
                        }

                        writer.write(line + lineSign);
                    }

                    writer.flush();
                } finally {
                    // Release both file handles even when parsing or writing
                    // failed part-way through; the reader is closed even if
                    // closing the writer throws.
                    try {
                        if (writer != null) {
                            writer.close();
                        }
                    } finally {
                        if (reader != null) {
                            reader.close();
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Best-effort tool: report the failure and carry on.
            System.out.println(e);
        }
    }

    /**
     * Returns the site root (scheme plus host, without a trailing slash) of
     * a link, e.g. {@code http://www.qiche.com.cn} for
     * {@code http://www.qiche.com.cn/files/200712/12016.shtml}.
     *
     * <p>Only the first {@code http://} or {@code https://} URL found in
     * {@code link} is used, so a URL embedded later in the path or query
     * string (for example a redirect target) does not replace the link's own
     * host. Links without any path (e.g. {@code http://host}) are supported.
     * The scheme is matched case-insensitively and returned as written.
     *
     * @param link the absolute link to analyse; may be {@code null}
     * @return the site root, or an empty string when {@code link} is
     *         {@code null} or contains no http/https URL
     */
    private String getLinkUrl(String link) {
        if (link == null) {
            return "";
        }

        // Scheme followed by everything up to the first '/', '?' or '#'.
        Pattern siteRootPattern = Pattern.compile("https?://[^/?#]+",
                Pattern.CASE_INSENSITIVE);
        Matcher matcher = siteRootPattern.matcher(link);

        // Take the first match only: later matches belong to embedded URLs.
        if (matcher.find()) {
            return matcher.group().trim();
        }

        return "";
    }

    /**
    * 递归钻取正文信息
    * @param nodeP
    * @return
    */
    protected List extractHtml(Node nodeP, PageContext context, String siteUrl)
        throws Exception {
        NodeList nodeList = nodeP.getChildren();
        boolean bl = false;

        if ((nodeList == null) || (nodeList.size() == 0)) {
            if (nodeP instanceof ParagraphTag) {
                ArrayList tableList = new ArrayList();
                StringBuffer temp = new StringBuffer();
                temp.append("<p style=\"TEXT-INDENT: 2em\">");
                tableList.add(temp);
                temp = new StringBuffer();
                temp.append("</p>").append(lineSign);
                tableList.add(temp);

                return tableList;
            }

            return null;
        }

        if ((nodeP instanceof TableTag) || (nodeP instanceof Div)) {
            bl = true;
        }

        if (nodeP instanceof ParagraphTag) {
            ArrayList tableList = new ArrayList();
            StringBuffer temp = new StringBuffer();
            temp.append("<p style=\"TEXT-INDENT: 2em\">");
            tableList.add(temp);
            extractParagraph(nodeP, siteUrl, tableList);

            temp = new StringBuffer();
            temp.append("</p>").append(lineSign);

            tableList.add(temp);

            return tableList;
        }

        ArrayList tableList = new ArrayList();

        try {
            for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) {
                Node node = (Node) e.nextNode();

                if (node instanceof LinkTag) {
                    tableList.add(node);
                    setLinkImg(node, siteUrl);
                } else if (node instanceof ImageTag) {
                    ImageTag img = (ImageTag) node;

                    if (img.getImageURL().toLowerCase().indexOf("http://") < 0) {
                        img.setImageURL(siteUrl + img.getImageURL());
                    } else {
                        img.setImageURL(img.getImageURL());
                    }

                    tableList.add(node);
                } else if (node instanceof ScriptTag ||
                        node instanceof StyleTag || node instanceof SelectTag) {
                } else if (node instanceof TextNode) {
                    if (node.getText().length() > 0) {
                        StringBuffer temp = new StringBuffer();
                        String text = collapse(node.getText()
                                                   .replaceAll("&nbsp;", "")
                                                   .replaceAll("　", ""));

                        temp.append(text.trim());

                        tableList.add(temp);
                    }
                } else {
                    if (node instanceof TableTag || node instanceof Div) {
                        TableValid tableValid = new TableValid();
                        isValidTable(node, tableValid);

                        if (tableValid.getTrnum() > 2) {
                            tableList.add(node);

                            continue;
                        }
                    }

                    List tempList = extractHtml(node, context, siteUrl);

                    if ((tempList != null) && (tempList.size() > 0)) {
                        Iterator ti = tempList.iterator();

                        while (ti.hasNext()) {
                            tableList.add(ti.next());
                        }
                    }
                }
            }
        } catch (Exception e) {
            return null;
        }

        if ((tableList != null) && (tableList.size() > 0)) {
            if (bl) {
                StringBuffer temp = new StringBuffer();
                Iterator ti = tableList.iterator();
                int wordSize = 0;
                StringBuffer node;
                int status = 0;
                StringBuffer lineStart = new StringBuffer(
                        "<p style=\"TEXT-INDENT: 2em\">");
                StringBuffer lineEnd = new StringBuffer("</p>" + lineSign);

                while (ti.hasNext()) {
                    Object k = ti.next();

                    if (k instanceof LinkTag) {
                        if (status == 0) {
                            temp.append(lineStart);
                            status = 1;
                        }

                        node = new StringBuffer(((LinkTag) k).toHtml());
                        temp.append(node);
                    } else if (k instanceof ImageTag) {
                        if (status == 0) {
                            temp.append(lineStart);
                            status = 1;
                        }

                        node = new StringBuffer(((ImageTag) k).toHtml());
                        temp.append(node);
                    } else if (k instanceof TableTag) {
                        if (status == 0) {
                            temp.append(lineStart);
                            status = 1;
                        }

                        node = new StringBuffer(((TableTag) k).toHtml());
                        temp.append(node);
                    } else if (k instanceof Div) {
                        if (status == 0) {
                            temp.append(lineStart);
                            status = 1;
                        }

                        node = new StringBuffer(((Div) k).toHtml());
                        temp.append(node);
                    } else {
                        node = (StringBuffer) k;

                        if (status == 0) {
                            if (node.indexOf("<p") < 0) {
                                temp.append(lineStart);
                                temp.append(node);
                                wordSize = wordSize + node.length();
                                status = 1;
                            } else {
                                temp.append(node);
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -