📄 html2xml.java
字号:
import java.net.URL;
import java.io.*;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;
/**
 * Downloads an HTML page from a URL and writes it out as XML using JTidy.
 *
 * <p>Diagnostics produced by Tidy while parsing are written to a separate
 * error file.
 */
public class HTML2XML {
    private final String url;
    private final String outFileName;
    private final String errOutFileName;

    /**
     * Creates a converter for one page.
     *
     * @param url            URL of the HTML page to read
     * @param outFileName    name of the file that receives the XML output
     * @param errOutFileName name of the file that receives Tidy's error messages
     */
    public HTML2XML(String url, String outFileName, String errOutFileName) {
        this.url = url;
        this.outFileName = outFileName;
        this.errOutFileName = errOutFileName;
    }

    /**
     * Fetches the page at {@code url}, runs it through Tidy and writes the
     * result to {@code outFileName}.
     *
     * <p>Any {@link IOException} (bad URL, network failure, unwritable file) is
     * caught and reported on standard output; it is not propagated to the caller.
     */
    public void convert() {
        Tidy tidy = new Tidy();
        // Tell Tidy to convert HTML to XML
        tidy.setXmlOut(true);
        // Optional clean-up switches, disabled by default:
        //tidy.setDropFontTags(true);    // drop <font> elements
        //tidy.setDropEmptyParas(true);  // drop empty paragraphs
        //tidy.setFixComments(true);     // repair malformed comments
        //tidy.setFixBackslash(true);    // repair backslashes
        //tidy.setMakeClean(true);       // strip presentational clutter
        //tidy.setQuoteNbsp(false);      // write non-breaking spaces as &nbsp;
        //tidy.setQuoteMarks(false);     // write double quotes as &quot;
        //tidy.setQuoteAmpersand(true);  // write & as &amp;
        tidy.setCharEncoding(Configuration.RAW);
        tidy.setXmlPi(true);

        // try-with-resources guarantees that the error log and both streams are
        // closed even when opening a later resource or the parse itself fails.
        // Resources are opened in the same order as before: error file, input
        // stream, output file.
        try (PrintWriter errOut = new PrintWriter(new FileWriter(errOutFileName), true);
             InputStream in = new BufferedInputStream(new URL(url).openStream());
             OutputStream out = new FileOutputStream(outFileName)) {
            // Set file for error messages
            tidy.setErrout(errOut);
            // Convert files
            tidy.parse(in, out);
        } catch (IOException e) {
            System.out.println(this.toString() + e.toString());
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>Parameters are:
     * <ol>
     *   <li>URL of HTML file</li>
     *   <li>Filename of output file</li>
     *   <li>Filename of error file</li>
     * </ol>
     * When fewer than three arguments are supplied the built-in defaults are used.
     *
     * @param args optional command-line arguments as described above
     */
    public static void main(String args[]) {
        String pageUrl = "http://www.imdb.com/title/tt0068646/";
        String xmlFile = "wanfang.xml";
        String errFile = "wanfang.txt";
        if (args != null && args.length >= 3) {
            pageUrl = args[0];
            xmlFile = args[1];
            errFile = args[2];
        }
        HTML2XML t = new HTML2XML(pageUrl, xmlFile, errFile);
        t.convert();
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -