⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extractcontext.java

📁 网页是组成互联网的基本数据单元
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
                                status = 1;
                            }
                        } else if (status == 1) {
                            if (node.indexOf("</p") < 0) {
                                if (node.indexOf("<p") < 0) {
                                    temp.append(node);
                                    wordSize = wordSize + node.length();
                                } else {
                                    temp.append(lineEnd);
                                    temp.append(node);
                                    status = 1;
                                }
                            } else {
                                temp.append(node);
                                status = 0;
                            }
                        }
                    }
                }

                if (status == 1) {
                    temp.append(lineEnd);
                }

                if (wordSize > context.getNumber()) {
                    context.setNumber(wordSize);
                    context.setTextBuffer(temp);
                }

                return null;
            } else {
                return tableList;
            }
        }

        return null;
    }

    /**
     * Rewrites the URL of every image that is a direct child of the given node.
     *
     * <p>An image URL that contains neither {@code http://} nor {@code https://}
     * (compared case-insensitively) is prefixed with {@code siteUrl}; any other
     * URL is written back to the tag unchanged. Only direct children are
     * visited; the method does not recurse.
     *
     * <p>This is a best-effort operation: if anything fails while iterating,
     * processing stops silently and images already rewritten stay rewritten.
     *
     * @param nodeP   node whose direct children are scanned for image tags
     * @param siteUrl prefix prepended to image URLs that carry no http(s) scheme
     */
    private void setLinkImg(Node nodeP, String siteUrl) {
        NodeList nodeList = nodeP.getChildren();

        // A node without a child list has no images to rewrite. Checking here
        // avoids relying on a NullPointerException being swallowed below.
        if (nodeList == null) {
            return;
        }

        try {
            for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) {
                Node node = (Node) e.nextNode();

                if (!(node instanceof ImageTag)) {
                    continue;
                }

                ImageTag img = (ImageTag) node;
                String url = img.getImageURL();
                String lowerUrl = url.toLowerCase();

                // Treat both http and https URLs as already absolute; checking
                // only "http://" would wrongly prefix https URLs with siteUrl.
                boolean hasScheme = (lowerUrl.indexOf("http://") >= 0) ||
                    (lowerUrl.indexOf("https://") >= 0);

                if (hasScheme) {
                    img.setImageURL(url);
                } else {
                    img.setImageURL(siteUrl + url);
                }
            }
        } catch (Exception ignored) {
            // Deliberately best effort: a failure while walking the children
            // must not abort the surrounding extraction.
            return;
        }
    }

    /**
     * Drills into a node and flattens its content into {@code tableList}.
     *
     * <p>The children of {@code nodeP} are visited in order and handled by type:
     * <ul>
     *   <li>script, style and select tags are skipped;</li>
     *   <li>link tags are added as nodes, after the images directly inside them
     *       have had their URLs rewritten;</li>
     *   <li>image tags are added as nodes; a URL containing neither
     *       {@code http://} nor {@code https://} is prefixed with
     *       {@code siteUrl};</li>
     *   <li>non-blank text nodes and spans are added as {@code StringBuffer}s
     *       with {@code &nbsp;} and spaces removed and the rest passed through
     *       {@code collapse};</li>
     *   <li>other tags whose HTML is at most 10 characters are added (lower-cased)
     *       when they contain "strong" or "b";</li>
     *   <li>longer table/div tags that {@code isValidTable} counts more than two
     *       rows for are added as nodes; every other longer tag is processed
     *       recursively.</li>
     * </ul>
     *
     * @param nodeP     node whose children are extracted
     * @param siteUrl   prefix prepended to image URLs that carry no http(s) scheme
     * @param tableList output list that extracted pieces are appended to
     * @return {@code tableList}; or {@code null} when {@code nodeP} has no
     *         children and is not a paragraph tag, or when an exception occurs
     *         while walking the children
     */
    private List extractParagraph(Node nodeP, String siteUrl, List tableList) {
        NodeList nodeList = nodeP.getChildren();

        if ((nodeList == null) || (nodeList.size() == 0)) {
            if (nodeP instanceof ParagraphTag) {
                // An empty paragraph is still emitted as an opening and a
                // closing piece so the paragraph break is preserved.
                StringBuffer open = new StringBuffer();
                open.append("<p style=\"TEXT-INDENT: 2em\">");
                tableList.add(open);

                StringBuffer close = new StringBuffer();
                close.append("</p>").append(lineSign);
                tableList.add(close);

                return tableList;
            }

            return null;
        }

        try {
            for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) {
                Node node = (Node) e.nextNode();

                if (node instanceof ScriptTag || node instanceof StyleTag ||
                        node instanceof SelectTag) {
                    // These tags contribute nothing to the extracted content.
                    continue;
                } else if (node instanceof LinkTag) {
                    tableList.add(node);
                    setLinkImg(node, siteUrl);
                } else if (node instanceof ImageTag) {
                    ImageTag img = (ImageTag) node;
                    String url = img.getImageURL();
                    String lowerUrl = url.toLowerCase();

                    // Treat both http and https URLs as already absolute;
                    // checking only "http://" would wrongly prefix https URLs.
                    boolean hasScheme = (lowerUrl.indexOf("http://") >= 0) ||
                        (lowerUrl.indexOf("https://") >= 0);

                    if (hasScheme) {
                        img.setImageURL(url);
                    } else {
                        img.setImageURL(siteUrl + url);
                    }

                    tableList.add(node);
                } else if (node instanceof TextNode) {
                    if (node.getText().trim().length() > 0) {
                        String text = collapse(node.getText()
                                                   .replaceAll("&nbsp;", "")
                                                   .replaceAll(" ", ""));
                        StringBuffer temp = new StringBuffer();
                        temp.append(text);
                        tableList.add(temp);
                    }
                } else if (node instanceof Span) {
                    StringBuffer spanWord = new StringBuffer();
                    getSpanWord(node, spanWord);

                    if (spanWord.length() > 0) {
                        String text = collapse(spanWord.toString()
                                                       .replaceAll("&nbsp;", "")
                                                       .replaceAll(" ", ""));

                        StringBuffer temp = new StringBuffer();
                        temp.append(text);
                        tableList.add(temp);
                    }
                } else if (node instanceof TagNode) {
                    String tag = node.toHtml();

                    if (tag.length() <= 10) {
                        tag = tag.toLowerCase();

                        // NOTE(review): indexOf("b") matches any short tag whose
                        // HTML contains the letter b (for example a br tag), not
                        // only bold tags -- confirm this is intended.
                        if ((tag.indexOf("strong") >= 0) ||
                                (tag.indexOf("b") >= 0)) {
                            StringBuffer temp = new StringBuffer();
                            temp.append(tag);
                            tableList.add(temp);
                        }
                    } else {
                        if (node instanceof TableTag || node instanceof Div) {
                            TableValid tableValid = new TableValid();
                            isValidTable(node, tableValid);

                            // More than two counted rows: keep the whole
                            // table/div as a node instead of flattening it.
                            if (tableValid.getTrnum() > 2) {
                                tableList.add(node);

                                continue;
                            }
                        }

                        // The nested result is ignored on purpose: pieces are
                        // appended to the shared tableList as a side effect.
                        extractParagraph(node, siteUrl, tableList);
                    }
                }
            }
        } catch (Exception e) {
            // Callers receive null when the walk fails part-way; pieces already
            // appended to tableList are left in place.
            return null;
        }

        return tableList;
    }

    /**
     * Collects the text found beneath a node into {@code spanWord}.
     *
     * <p>Direct children are visited in order: script, style and select tags
     * are skipped; text nodes contribute their text; nested spans and
     * paragraphs are processed recursively; any other tag whose lower-cased
     * HTML is at most 10 characters long and contains "strong" or "b" is
     * appended as-is.
     *
     * <p>This is a best-effort operation: if anything fails while iterating,
     * collection stops silently and text gathered so far is kept.
     *
     * @param nodeP    node whose children are scanned
     * @param spanWord buffer that collected text is appended to
     */
    protected void getSpanWord(Node nodeP, StringBuffer spanWord) {
        NodeList nodeList = nodeP.getChildren();

        // A node without a child list has nothing to collect. Checking here
        // avoids relying on a NullPointerException being swallowed below.
        if (nodeList == null) {
            return;
        }

        try {
            for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) {
                Node node = (Node) e.nextNode();

                if (node instanceof ScriptTag || node instanceof StyleTag ||
                        node instanceof SelectTag) {
                    // These tags contribute no text.
                    continue;
                } else if (node instanceof TextNode) {
                    spanWord.append(node.getText());
                } else if (node instanceof Span ||
                        node instanceof ParagraphTag) {
                    getSpanWord(node, spanWord);
                } else if (node instanceof TagNode) {
                    String tag = node.toHtml().toLowerCase();

                    // NOTE(review): indexOf("b") matches any short tag whose
                    // HTML contains the letter b, not only bold tags --
                    // confirm this is intended.
                    if ((tag.length() <= 10) &&
                            ((tag.indexOf("strong") >= 0) ||
                            (tag.indexOf("b") >= 0))) {
                        spanWord.append(tag);
                    }
                }
            }
        } catch (Exception ignored) {
            // Deliberately best effort: keep whatever text was collected
            // before the failure.
        }
    }

    /**
     * Counts the uniform rows beneath a node and records the result in
     * {@code tableValid}.
     *
     * <p>Children are scanned in order. The scan of the current level stops at
     * the first nested table or div, script, style or select tag, or table
     * column found outside a row. For each table row, {@code findTD} counts its
     * cells; a row flagged invalid is ignored. A valid row with fewer than two
     * cells is skipped while no column count has been recorded yet and stops
     * the scan afterwards. The first valid row with two or more cells fixes the
     * column count; each later row with the same count increments the row
     * count, and a row with a different count stops the scan. Any other child
     * is scanned recursively with the same {@code tableValid}.
     *
     * <p>An exception raised while scanning ends the scan silently; counts
     * recorded up to that point are kept.
     *
     * @param nodeP      node whose children are examined
     * @param tableValid accumulator for the column count and row count
     */
    private void isValidTable(Node nodeP, TableValid tableValid) {
        NodeList children = nodeP.getChildren();

        // Nothing to examine when the node has no children.
        if ((children == null) || (children.size() == 0)) {
            return;
        }

        try {
            NodeIterator it = children.elements();

            while (it.hasMoreNodes()) {
                Node child = (Node) it.nextNode();

                // A nested table/div, a script/style/select tag, or a column
                // outside of a row ends the scan of this level.
                boolean stopsScan = child instanceof TableTag ||
                    child instanceof Div || child instanceof ScriptTag ||
                    child instanceof StyleTag || child instanceof SelectTag ||
                    child instanceof TableColumn;

                if (stopsScan) {
                    return;
                }

                if (!(child instanceof TableRow)) {
                    // Wrapper element: look for rows further down.
                    isValidTable(child, tableValid);

                    continue;
                }

                TableColumnValid rowCheck = new TableColumnValid();
                rowCheck.setValid(true);
                findTD(child, rowCheck);

                if (!rowCheck.isValid()) {
                    continue;
                }

                if (rowCheck.getTdNum() < 2) {
                    // Narrow rows are tolerated only before the first counted row.
                    if (tableValid.getTdnum() > 0) {
                        return;
                    }

                    continue;
                }

                if (tableValid.getTdnum() == 0) {
                    // First counted row fixes the expected number of columns.
                    tableValid.setTdnum(rowCheck.getTdNum());
                } else if (tableValid.getTdnum() != rowCheck.getTdNum()) {
                    return;
                }

                tableValid.setTrnum(tableValid.getTrnum() + 1);
            }
        } catch (Exception ex) {
            return;
        }
    }

    /**
     * Counts the table columns beneath a row and records the result in
     * {@code tcValid}.
     *
     * <p>Children are scanned in order. Each table column increments the cell
     * count and is not descended into. A nested table, div, table row or table
     * header, or a script, style or select tag, marks the row invalid and ends
     * the scan of the current level. Any other child is scanned recursively.
     * An exception raised while scanning also marks the row invalid.
     *
     * @param nodeP   node whose children are examined
     * @param tcValid accumulator for the cell count and the validity flag
     */
    private void findTD(Node nodeP, TableColumnValid tcValid) {
        NodeList children = nodeP.getChildren();

        // Nothing to examine when the node has no children.
        if ((children == null) || (children.size() == 0)) {
            return;
        }

        try {
            NodeIterator it = children.elements();

            while (it.hasMoreNodes()) {
                Node child = (Node) it.nextNode();

                // Nested tabular structure or script/style/select content
                // disqualifies the row.
                boolean disqualifies = child instanceof TableTag ||
                    child instanceof Div || child instanceof TableRow ||
                    child instanceof TableHeader ||
                    child instanceof ScriptTag ||
                    child instanceof StyleTag || child instanceof SelectTag;

                if (disqualifies) {
                    tcValid.setValid(false);

                    return;
                }

                if (child instanceof TableColumn) {
                    tcValid.setTdNum(tcValid.getTdNum() + 1);
                } else {
                    findTD(child, tcValid);
                }
            }
        } catch (Exception ex) {
            tcValid.setValid(false);
        }
    }

    /**
     * Collapses whitespace in the given text.
     *
     * <p>Leading and trailing whitespace is dropped and every internal run of
     * whitespace is replaced by a single space. The characters treated as
     * whitespace are: space, tab, form feed, zero-width space (U+200B),
     * no-break space (U+00A0), carriage return and line feed.
     *
     * @param string text to collapse; must not be {@code null}
     * @return the collapsed text, possibly empty
     * @throws NullPointerException if {@code string} is {@code null}
     */
    protected String collapse(String string) {
        int chars = string.length();
        StringBuffer buffer = new StringBuffer(chars);

        // True while whitespace has been seen after some emitted content; the
        // separating space is written only once more content follows, so
        // trailing whitespace never reaches the output.
        boolean pendingSpace = false;

        for (int i = 0; i < chars; i++) {
            char character = string.charAt(i);

            switch (character) {
            case '\u0020':
            case '\u0009':
            case '\u000C':
            case '\u200B':
            case '\u00a0':
            case '\r':
            case '\n':
                // Whitespace before any content is dropped entirely.
                if (buffer.length() > 0) {
                    pendingSpace = true;
                }

                break;

            default:

                if (pendingSpace) {
                    buffer.append(' ');
                    pendingSpace = false;
                }

                buffer.append(character);
            }
        }

        return buffer.toString();
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -