📄 webparser.java
字号:
/*
* Strips the tags and whitespace out of the downloaded HTML source and saves the extracted body text to a txt file.
*/
package com.catking.webSearcher;
import java.net.*;
import java.io.*;
/**
 * Extracts the visible text from the HTML file named by
 * {@code WebCrawler.htmlFileName} and stores it in a plain-text file.
 *
 * <p>Everything before the first {@code '<'} and everything between
 * {@code '<'} and {@code '>'} is discarded. From the remaining text a small
 * set of punctuation/whitespace characters, the {@code &nbsp;} entity and the
 * {@code //--} marker are removed as well.
 */
public class WebParser {
    /** Name of the text file that receives the extracted content. */
    private static String txtFileName = "index.txt";

    /** Size, in chars, of the chunk buffer used while reading the HTML file. */
    private static final int READ_BUFFER_SIZE = 8096 * 2;

    /** HTML entity that is dropped from the extracted text. */
    private static final String NBSP_ENTITY = "&nbsp;";

    /** Four-character marker that is dropped from the extracted text. */
    private static final String SLASH_DASH_MARK = "//--";

    /**
     * Command-line entry point: runs {@link #parser()} and exits with status 1
     * if an I/O error occurs.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            parser();
        } catch (IOException e) {
            // User-facing message kept exactly as in the original program.
            System.out.println("下载失败");
            System.exit(1);
        }
    }

    /**
     * Reads the HTML file, extracts its text content, prints a per-character
     * trace to standard output and writes the result to {@code txtFileName}.
     *
     * <p>The whole input file is processed, regardless of its size.
     *
     * @throws IOException if the HTML file cannot be read or the text file
     *                     cannot be written
     */
    public static void parser() throws IOException {
        String html = readAll(new File(WebCrawler.htmlFileName));
        String text = extractText(html);

        // Trace every retained character together with its numeric char value.
        for (int i = 0; i < text.length(); ++i) {
            char c = text.charAt(i);
            System.out.print("*" + c);
            System.out.println((int) c + "*");
        }

        writeAll(new File(txtFileName), text);
    }

    /** Reads the complete content of {@code file}; the reader is always closed. */
    private static String readAll(File file) throws IOException {
        StringBuilder content = new StringBuilder(READ_BUFFER_SIZE);
        char[] chunk = new char[READ_BUFFER_SIZE];
        FileReader reader = new FileReader(file);
        try {
            int count;
            // A single read() may return fewer chars than the file holds,
            // so keep reading until end of stream (-1).
            while ((count = reader.read(chunk)) != -1) {
                content.append(chunk, 0, count);
            }
        } finally {
            reader.close();
        }
        return content.toString();
    }

    /** Writes {@code text} to {@code file}; the writer is always closed. */
    private static void writeAll(File file, String text) throws IOException {
        FileWriter writer = new FileWriter(file);
        try {
            writer.write(text);
        } finally {
            writer.close();
        }
    }

    /** Returns the characters of {@code html} that lie outside tags and are not filtered out. */
    private static String extractText(String html) {
        int length = html.length();
        StringBuilder text = new StringBuilder(length);
        boolean seenFirstTag = false; // nothing is kept before the first '<'
        boolean insideTag = false;    // true between '<' and the next '>'

        for (int i = 0; i < length; ++i) {
            char c = html.charAt(i);

            if (c == '<') {
                insideTag = true;
                seenFirstTag = true;
                continue;
            }
            if (!seenFirstTag) {
                continue;
            }
            if (c == '>') {
                insideTag = false;
                continue;
            }
            if (insideTag || isIgnoredChar(c)) {
                continue;
            }
            // startsWith(prefix, offset) is bounds-safe: it simply returns
            // false when fewer characters than the prefix remain.
            if (html.startsWith(NBSP_ENTITY, i)) {
                i += NBSP_ENTITY.length() - 1; // the loop's ++i skips the last char
                continue;
            }
            if (html.startsWith(SLASH_DASH_MARK, i)) {
                i += SLASH_DASH_MARK.length() - 1;
                continue;
            }
            text.append(c);
        }
        return text.toString();
    }

    /**
     * Tells whether {@code c} is one of the punctuation/whitespace characters
     * removed from the text: {@code '.'}, {@code '?'}, line feed, space, tab,
     * U+3000 (ideographic space, 12288) and U+00B7 (middle dot, 183).
     */
    private static boolean isIgnoredChar(char c) {
        return c == '.' || c == '?' || c == '\n' || c == ' ' || c == '\t'
                || c == '\u3000' || c == '\u00B7';
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -