// filerlocal.java
// (code-viewer "font size" control residue removed; it was not part of the program)
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
public class FilerLocal {
/**
 * Command-line entry point: reads one HTML file, prints its title and its
 * body text, then prints the elapsed wall-clock time in milliseconds.
 *
 * @param args optional; {@code args[0]} is the path of the HTML file
 *             (defaults to {@code d:\cs\china\6.htm}) and {@code args[1]}
 *             is the charset name used to decode it (defaults to {@code GBK})
 * @throws Exception if HTML parsing fails while extracting the body text
 */
public static void main(String[] args) throws Exception
{
    long start_time = System.currentTimeMillis();//ms
    // Fall back to the original hard-coded sample when no arguments are given,
    // so existing invocations keep working unchanged.
    String aFile = (args != null && args.length > 0) ? args[0] : "d:\\cs\\china\\6.htm";
    String encoding = (args != null && args.length > 1) ? args[1] : "GBK";
    String content = readTextFile(aFile, encoding);
    System.out.println("新闻标题是:");   // "The news title is:"
    testtitle(content);
    System.out.println("新闻内容是:");   // "The news content is:"
    testcontent(content);
    System.out.println("核心内容是:");   // "The core content is:" (the outline step is disabled)
    // outline(content);
    long end_time = System.currentTimeMillis();
    System.out.println("time is " + (end_time - start_time));
}
/**
 * Prints the page title: collects every {@code <title>} element, strips its
 * markup with {@code replaceTitle}, and prints the part before the first
 * dash as returned by {@code getTitle}.
 *
 * Extraction is best-effort: a failure is reported on stderr and whatever
 * title text was collected so far is still printed.
 *
 * @param content the HTML text to parse
 * @throws ParserException declared for callers; failures inside the method
 *                         are caught and reported rather than propagated
 */
private static void testtitle(String content) throws ParserException{
    StringBuffer titleText = new StringBuffer();
    try{
        Parser myParser = new Parser(content);
        NodeList nl = myParser.extractAllNodesThatMatch(new TagNameFilter("title"));
        for (int i = 0; i < nl.size(); i++)
        {
            titleText.append(replaceTitle(nl.elementAt(i).toHtml()));
        }
    }
    catch (Exception e)
    {
        // Previously swallowed silently; keep going, but say what went wrong.
        System.err.println("testtitle: could not extract <title>: " + e);
    }
    System.out.println(getTitle(titleText.toString()));
}
/**
 * Prints the body text of the page, wrapped by {@code showContent}.
 *
 * If the page has {@code <p>} elements, each one is printed unless it
 * contains a link ({@code href=}) and either also contains {@code <IMG} or
 * {@code cleanLink} reports no non-ASCII text around the links.
 * If the page has no {@code <p>} elements, every {@code <div>} whose HTML
 * contains exactly one {@code <div} (i.e. no nested div) is printed instead.
 * A separator line is printed at the end.
 *
 * The original also scanned the matched nodes for {@code <STRONG>} text, but
 * the result was never used and the scan threw
 * {@code StringIndexOutOfBoundsException} when the closing tag was missing,
 * so that dead step has been removed.
 *
 * @param content the HTML text to parse
 * @throws ParserException if the HTML parser fails
 */
private static void testcontent(String content) throws ParserException{
    Parser myParser = new Parser(content);
    NodeList nl = myParser.extractAllNodesThatMatch(new TagNameFilter("p"));
    if (nl.size() != 0) {
        for (int i = 0; i < nl.size(); i++)
        {
            String t = nl.elementAt(i).toHtml();
            boolean hasLink = t.indexOf("href=") != -1;
            boolean hasImage = t.indexOf("<IMG") != -1;
            boolean keep;
            if (hasLink) {
                // cleanLink is consulted for every linked paragraph, as before,
                // even when the image check alone would already reject it.
                int textAroundLinks = cleanLink(t);
                keep = !hasImage && textAroundLinks != 0;
            } else {
                keep = true;
            }
            if (keep)
            {
                System.out.println(showContent(replaceHtml(t)));
            }
        }
    }
    else
    {
        // No paragraphs at all: fall back to leaf <div> elements.
        myParser.reset();
        nl = myParser.extractAllNodesThatMatch(new TagNameFilter("div"));
        for (int i = 0; i < nl.size(); i++)
        {
            String text = nl.elementAt(i).toHtml();
            // Exactly one "<div" means the element has no nested div.
            if (count(text, "<div") == 1)
            {
                System.out.println(showContent(replaceHtml(text)));
            }
        }
    }
    System.out.println("==============================");
}
//输出核心提示
/* private static void outline(String content) throws ParserException{
int href;
//String sbStr="";
Parser myParser=new Parser(content);
NodeFilter filter = new TagNameFilter("ul");//
NodeList nl = myParser.extractAllNodesThatMatch(filter);
for (int i = 0; i < nl.size(); i++)
{
String t=nl.elementAt(i).toHtml();
System.out.println(t);
href=t.indexOf("href=");
if(href==-1)
{
//sbStr=sbStr+replaceHtml(nl.elementAt(i).toHtml());
System.out.println(replaceHtml(nl.elementAt(i).toHtml()));
}
}
// return getTitle(sbStr);
// System.out.println(getTitle(sbStr));
}
*/
/**
 * Counts how many sides of the linked region of an HTML fragment carry
 * non-ASCII text (e.g. Chinese): the text before the first {@code <a}/{@code <A}
 * and the text after the last {@code </a>}/{@code </A>}.
 *
 * Tag matching ignores the case of the letter {@code a}, which gives the same
 * indices as normalising the fragment with {@code replaceLink} first.
 * A fragment without an opening anchor is treated as all "before" text; a
 * fragment without a closing anchor has no "after" text. The original threw
 * {@code StringIndexOutOfBoundsException} or sliced at a bogus offset in those
 * cases.
 *
 * @param html the HTML fragment to inspect; must not be {@code null}
 * @return 0, 1 or 2 - the number of sides that contain a non-ASCII character
 */
public static int cleanLink(String html){
    int lowerOpen = html.indexOf("<a");
    int upperOpen = html.indexOf("<A");
    int firstOpen;
    if (lowerOpen < 0) {
        firstOpen = upperOpen;
    } else if (upperOpen < 0) {
        firstOpen = lowerOpen;
    } else {
        firstOpen = Math.min(lowerOpen, upperOpen);
    }
    int lastClose = Math.max(html.lastIndexOf("</a>"), html.lastIndexOf("</A>"));

    String before = (firstOpen >= 0) ? html.substring(0, firstOpen) : html;
    // "</a>".length() == 4
    String after = (firstOpen >= 0 && lastClose >= 0) ? html.substring(lastClose + 4) : "";

    int sin = 0;
    if (containsNonAscii(before)) {
        sin++;
    }
    if (containsNonAscii(after)) {
        sin++;
    }
    return sin;
}

/**
 * Tells whether the string holds any character outside 7-bit ASCII.
 * Replaces the original byte-length comparison, whose result depended on the
 * platform default charset.
 */
private static boolean containsNonAscii(String s) {
    for (int i = 0; i < s.length(); i++) {
        if (s.charAt(i) > 0x7F) {
            return true;
        }
    }
    return false;
}
/**
 * Reads a whole text file into one string, replacing every line break with a
 * single space (so a trailing space follows the last line).
 *
 * Errors are not propagated: the stack trace is printed and whatever was read
 * before the failure (possibly the empty string) is returned. The underlying
 * file stream is always closed, including when opening the reader or reading
 * a line fails.
 *
 * @param sFileName path of the file to read
 * @param sEncode   charset name used to decode the file, e.g. {@code "GBK"}
 * @return the file content with lines joined by spaces; never {@code null}
 */
public static String readTextFile(String sFileName, String sEncode)
{
    StringBuffer sbStr = new StringBuffer();
    FileInputStream in = null;
    try
    {
        in = new FileInputStream(new File(sFileName));
        BufferedReader ins = new BufferedReader(new InputStreamReader(in, sEncode));
        String dataLine;
        while ((dataLine = ins.readLine()) != null)
        {
            sbStr.append(dataLine);
            sbStr.append(" ");
        }
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    finally
    {
        // Closing the file stream releases the descriptor even if the reader
        // was never constructed (e.g. unsupported charset name).
        if (in != null)
        {
            try
            {
                in.close();
            }
            catch (Exception ignored)
            {
                // nothing useful can be done if close itself fails
            }
        }
    }
    return sbStr.toString();
}
/**
 * Returns the leading part of a page title: markup is stripped and
 * underscores are turned into dashes by {@code replaceTitle}, then everything
 * from the first dash onwards is dropped.
 *
 * Unlike the original {@code split("-")[0]}, a title made only of
 * dashes/underscores yields the empty string instead of throwing
 * {@code ArrayIndexOutOfBoundsException} ({@code split} drops trailing empty
 * strings, leaving an empty array).
 *
 * @param str raw title text, possibly still containing tags
 * @return the text before the first dash, or the whole cleaned title if it
 *         contains no dash
 */
public static String getTitle(String str)
{
    String cleaned = replaceTitle(str);
    int dash = cleaned.indexOf('-');
    return (dash < 0) ? cleaned : cleaned.substring(0, dash);
}
/**
 * Hard-wraps text for console output: the input is cut into consecutive
 * chunks of at most 40 characters and each chunk is followed by CR LF.
 * An empty input yields an empty result.
 *
 * @param content the text to wrap; must not be {@code null}
 * @return the wrapped text
 */
public static String showContent(String content)
{
    final int lineWidth = 40; // characters per output line
    final int total = content.length();
    StringBuffer wrapped = new StringBuffer();
    for (int from = 0; from < total; from += lineWidth)
    {
        int to = Math.min(from + lineWidth, total);
        wrapped.append(content.substring(from, to));
        wrapped.append("\r\n");
    }
    return wrapped.toString();
}
/**
 * Upper-cases anchor tags so later searches only need one spelling:
 * every {@code <a} becomes {@code <A} and every {@code </a>} becomes
 * {@code </A>}. The opening pattern is a plain prefix match, so any tag whose
 * name starts with {@code a} has that first letter upper-cased as well.
 *
 * @param html the HTML text to normalise; must not be {@code null}
 * @return the text with anchor tags upper-cased
 */
public static String replaceLink(String html)
{
    String openingNormalised = html.replaceAll("<a|<A", "<A");
    return openingNormalised.replaceAll("</a>|</A>","</A>");
}
/**
 * Cleans title text: removes anything that looks like a tag
 * ({@code <...>} on a single line) and turns underscores into dashes so the
 * title has one kind of separator.
 *
 * @param html the title markup or text; must not be {@code null}
 * @return the text without tags, with {@code _} replaced by {@code -}
 */
public static String replaceTitle(String html)
{
    String withoutTags = html.replaceAll("<.*?>","");
    return withoutTags.replaceAll("-|_", "-");
}
public static String replaceHtml(String html){
//html=html.replaceAll("<a href[^>]*>.*</a>","");
html=html.replaceAll(">","");
html=html.replaceAll(">","");
html=html.replaceAll(" ","");
html=html.replaceAll("<SCRIPT[^>]*>.*</SCRIPT>","");
html=html.replaceAll("<style[^>]*>.*</style>","");
html=html.replaceAll("<!--.*?-->","");
html=html.replaceAll("<.*?>","");
html=html.replaceAll("·","");
html=html.replaceAll(""","");
return html;
}
/**
 * Counts the non-overlapping occurrences of one string inside another,
 * scanning left to right.
 *
 * An empty {@code sub} yields 0; the original loop never advanced in that
 * case and did not terminate.
 *
 * @param text the string to search in; must not be {@code null}
 * @param sub  the string to look for; must not be {@code null}
 * @return the number of non-overlapping occurrences of {@code sub} in {@code text}
 * @author : zhuzhu
 */
public static int count(String text,String sub){
    if (sub.length() == 0) {
        return 0;
    }
    int count = 0;
    int start = text.indexOf(sub);
    while (start >= 0) {
        count++;
        // Resume after the match so occurrences never overlap.
        start = text.indexOf(sub, start + sub.length());
    }
    return count;
}
}
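// ---- Code-viewer page residue (not part of the program) ----
// Keyboard shortcuts: copy code = Ctrl + C; search code = Ctrl + F; full screen = F11;
// toggle theme = Ctrl + Shift + D; show shortcuts = ?;
// increase font size = Ctrl + =; decrease font size = Ctrl + -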