📄 concatenator.java
字号:
/*
 * WebSPHINX web crawling toolkit
 * Copyright (C) 1998,1999 Carnegie Mellon University
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library
 * General Public License as published by the Free Software
 * Foundation, version 2.
 *
 * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/
 */
package websphinx;

import websphinx.util.Str;
import java.io.*;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.Hashtable;

/**
 * Transformer that concatenates multiple pages
 * into a single HTML page.
 * <P>
 * The entire set of pages is preceded by a "prolog"
 * and followed by an "epilog", which are constant
 * strings of HTML.  Each page is preceded
 * by a "header" and followed by a "footer".  Adjacent pages
 * are separated by a "divider".
 * <P>
 * Concatenator performs the following
 * transformations on pages before appending them together:
 * <UL>
 * <LI> deletes elements that would conflict once several pages share
 *      one document: &lt;TITLE&gt;, &lt;STYLE&gt;, &lt;BASE&gt;,
 *      &lt;ISINDEX&gt;, &lt;FRAMESET&gt; and &lt;FRAME&gt; are dropped
 *      together with their content
 * <LI> strips the start and end tags of &lt;HTML&gt;, &lt;HEAD&gt;,
 *      &lt;BODY&gt; and &lt;NOFRAMES&gt; but keeps their content
 * <LI> registers every written page under an in-page anchor of the
 *      form "#pageN", so that links among the written pages can be
 *      turned into in-page references (the link rewriting itself is
 *      inherited from RewritableLinkTransformer)
 * <LI> changes links to other pages into absolute references
 *      (as documented by the original author; that behavior is
 *      inherited and not visible in this class)
 * </UL>
 */
// FIX: transform anchors
public class Concatenator extends RewritableLinkTransformer {

    /** True when pages were written since the last rewrite and links may need fixing up. */
    boolean needRewrite = false;

    public static String defaultProlog = "<HTML><HEAD><TITLE>Concatenation</TITLE></HEAD><BODY>\n";
    public static String defaultHeader = "<TABLE WIDTH=\"100%\"><TR>\n"
                      + "<TD ALIGN=left><A NAME=\"%a\">%t [%u]</A>\n"
                      + "<TD ALIGN=right>Page %p</TABLE>\n";
    public static String defaultFooter = "";
    public static String defaultDivider = "\n<DIV STYLE=\"page-break-after: always;\"><HR></DIV>\n";
    public static String defaultEpilog = "\n</BODY></HTML>\n";

    String prolog = defaultProlog;
    String header = defaultHeader;
    String footer = defaultFooter;
    String divider = defaultDivider;
    String epilog = defaultEpilog;

    /** Number of pages passed to writePage() so far. */
    int nPages = 0;

    /**
     * Make a new Concatenator that writes to a file.
     * @param filename Filename to write concatenated pages to
     * @exception IOException if file cannot be opened
     */
    public Concatenator (String filename) throws IOException {
        super (makeDirs (filename));
    }

    /**
     * Ask the security policy to create the directory that will hold
     * the output file, if the filename names one.
     * @param filename output filename
     * @return filename, unchanged, so the call can be nested in super(...)
     * @exception IOException if the directory cannot be created
     */
    private static String makeDirs (String filename) throws IOException {
        File file = new File (filename);
        // File.getParent() returns null for a bare filename such as
        // "out.html"; wrapping that null in new File(...) would throw
        // NullPointerException, so test the name before building the File.
        String parentName = file.getParent ();
        if (parentName != null)
            SecurityPolicy.getPolicy ().makeDir (new File (parentName));
        return filename;
    }

    /**
     * Set the prolog.
     * @param prolog string of HTML that is emitted at the beginning
     * of the concatenation. Default value is: <BR>
     * <CODE>&lt;HTML&gt;&lt;HEAD&gt;&lt;TITLE&gt;Concatenation&lt;/TITLE&gt;&lt;/HEAD&gt;&lt;BODY&gt;\n</CODE>
     */
    public synchronized void setProlog (String prolog) {
        this.prolog = prolog;
    }

    /**
     * Get the prolog.
     * @return string of HTML that is emitted at the beginning
     * of the concatenation.
     */
    public String getProlog () {
        return prolog;
    }

    /**
     * Set the header.  The header can contain macro codes which
     * are replaced with attributes of the page about to be written:
     * <DL>
     * <DT>%t
     * <DD>title of the page
     * <DT>%u
     * <DD>URL of page
     * <DT>%a
     * <DD>anchor name of the page ("pageN", where N is the page number)
     * <DT>%p
     * <DD>page number (starting from 1)
     * </DL>
     * @param header string of HTML that is emitted before
     * each page. The default value is:<BR>
     * <CODE> &lt;TABLE WIDTH="100%"&gt;&lt;TR&gt;\n <BR>
     * &lt;TD ALIGN=left&gt;&lt;A NAME="%a"&gt;%t [%u]&lt;/A&gt;\n <BR>
     * &lt;TD ALIGN=right&gt;Page %p&lt;/TABLE&gt;\n</CODE>
     */
    public synchronized void setPageHeader (String header) {
        this.header = header;
    }

    /**
     * Get the header.
     * @return string of HTML that is emitted before
     * each page.
     */
    public String getPageHeader () {
        return header;
    }

    /**
     * Set the footer.  The footer can contain the same
     * macros as the header (%t, %u, %a, %p); see setPageHeader
     * for more details.
     * @param footer string of HTML that is emitted after
     * each page.
     */
    public synchronized void setPageFooter (String footer) {
        this.footer = footer;
    }

    /**
     * Get the footer.
     * @return string of HTML that is emitted after
     * each page.
     */
    public String getPageFooter () {
        return footer;
    }

    /**
     * Set the divider.
     * @param divider string of HTML that is emitted between
     * each pair of pages.
     */
    public synchronized void setDivider (String divider) {
        this.divider = divider;
    }

    /**
     * Get the divider.
     * @return string of HTML that is emitted between
     * each pair of pages.
     */
    public String getDivider () {
        return divider;
    }

    /**
     * Set the epilog.
     * @param epilog string of HTML that is emitted after
     * the entire concatenation.
     */
    public synchronized void setEpilog (String epilog) {
        this.epilog = epilog;
    }

    /**
     * Get the epilog.
     * @return string of HTML that is emitted after
     * the entire concatenation.
     */
    public String getEpilog () {
        return epilog;
    }

    /**
     * Get number of pages written to this concatenation.
     * @return number of calls to writePage() on this concatenation
     */
    public synchronized int getPageCount () {
        return nPages;
    }

    /**
     * Rewrite the concatenation.  Makes sure all the links
     * among concatenated pages have been fixed up.  Does nothing
     * when no rewrite is pending.
     * @exception IOException if the inherited rewrite fails
     */
    public synchronized void rewrite () throws IOException {
        if (needRewrite) {
            super.rewrite ();
            needRewrite = false;
        }
    }

    /**
     * Close the concatenation.  Makes sure all the links
     * among concatenated pages have been fixed up and closes
     * the file.
     * @exception IOException if writing, rewriting or closing fails
     */
    public synchronized void close () throws IOException {
        // writePage() emits the prolog in front of the first page only;
        // if no page was ever written it still has to go out here so
        // the epilog is not left without its matching prolog.
        if (nPages == 0)
            write (prolog);
        emit (epilog);
        rewrite ();
        super.close ();
    }

    /**
     * Write a page to the concatenation.  The page is preceded by the
     * prolog (first page) or the divider (later pages), wrapped in the
     * expanded header and footer, and registered under the anchor
     * "#pageN".
     * @param page Page to write
     * @exception IOException if the page cannot be written
     */
    public synchronized void writePage (Page page) throws IOException {
        ++nPages;
        emit ((nPages == 1) ? prolog : divider);

        String title = page.getTitle ();
        URL url = page.getURL ();
        // The image branch below and emitTemplate() both allow for a
        // missing URL, so do not dereference it unconditionally here.
        String urlString = (url != null) ? url.toExternalForm () : null;
        String anchor = "page" + nPages;
        // Only a page that has a URL can be the target of a link.
        if (url != null)
            map (url, "#" + anchor);

        emitTemplate (header, title, urlString, anchor, nPages);
        if (page.isImage () && url != null)
            super.write ("<IMG SRC='" + url + "'>");
        else if (page.isHTML ())
            // it's HTML, can write it normally
            super.writePage (page);
        else
            super.write (page.toHTML ());
        emitTemplate (footer, title, urlString, anchor, nPages);

        // Flagged from the second page on; NOTE(review): presumably
        // because only then can an already-written page hold a link to
        // a page that was mapped after it -- confirm against the
        // superclass.
        needRewrite = nPages > 1;
    }

    /**
     * Expand the macros in a header/footer template and emit the result.
     * A null or empty template emits nothing.
     */
    private void emitTemplate (String template,
                               String title, String url,
                               String anchor, int pages) throws IOException {
        if (template == null || template.length () == 0)
            return;
        emit (expandTemplate (template, title, url, anchor, pages));
    }

    /**
     * Replace %t, %u, %a and %p in template with the title, URL, anchor
     * name and page number.  Null values expand to the empty string.
     * The template is scanned exactly once, so text that was substituted
     * in (for example a page title that happens to contain "%p") is
     * never itself treated as a macro.
     */
    private static String expandTemplate (String template,
                                          String title, String url,
                                          String anchor, int pages) {
        String pageNumber = String.valueOf (pages);
        int len = template.length ();
        StringBuffer out = new StringBuffer (len + 64);
        for (int i = 0; i < len; ++i) {
            char c = template.charAt (i);
            String value = null;
            if (c == '%' && i + 1 < len) {
                switch (template.charAt (i + 1)) {
                    case 't': value = (title != null) ? title : ""; break;
                    case 'u': value = (url != null) ? url : ""; break;
                    case 'a': value = (anchor != null) ? anchor : ""; break;
                    case 'p': value = pageNumber; break;
                    default: break; // not a macro; '%' is copied literally
                }
            }
            if (value != null) {
                out.append (value);
                ++i; // skip the macro letter as well
            }
            else
                out.append (c);
        }
        return out.toString ();
    }

    /**
     * Process an HTML element for concatenation.  Deletes
     * elements that would conflict with other pages (such as
     * &lt;TITLE&gt; and &lt;BASE&gt;), strips the tags of wrapper
     * elements (such as &lt;HEAD&gt;) while keeping their content,
     * and hands every other element to the superclass.
     * @param elem HTML element to process
     * @exception IOException if writing the element fails
     */
    protected void handleElement (Element elem) throws IOException {
        String name = elem.getTagName ();
        if (isDeletedElement (name)) {
            // skip the entire element
        }
        else if (isUnwrappedElement (name)) {
            // skip only the start and end tags; preserve the content
            transformContents (elem);
        }
        else
            super.handleElement (elem);
    }

    /*
     * Tag names are compared with equals() rather than ==, so the
     * match does not depend on both strings being the same interned
     * instance.
     */

    /** True for elements that are removed together with their content. */
    private static boolean isDeletedElement (String name) {
        return Tag.TITLE.equals (name)
            || Tag.STYLE.equals (name)
            || Tag.BASE.equals (name)
            || Tag.ISINDEX.equals (name)
            || Tag.FRAMESET.equals (name)
            || Tag.FRAME.equals (name);
    }

    /** True for elements whose tags are removed but whose content is kept. */
    private static boolean isUnwrappedElement (String name) {
        return Tag.HTML.equals (name)
            || Tag.HEAD.equals (name)
            || Tag.BODY.equals (name)
            || Tag.NOFRAMES.equals (name);
    }

    /*
     * Testing
     */

    /**
     * Concatenate the pages named by all arguments but the last into
     * the file named by the last argument.
     * @param args zero or more page URLs followed by the output filename
     * @exception IllegalArgumentException if no output filename is given
     */
    public static void main (String[] args) throws Exception {
        if (args.length == 0)
            throw new IllegalArgumentException (
                "usage: Concatenator [url ...] output-file");

        HTMLTransformer out = new Concatenator (args[args.length-1]);
        for (int i=0; i<args.length-1; ++i) {
            Link link = new Link (args[i]);
            Page page = new Page (link);
            out.writePage (page);
        }
        out.close ();
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -