// Source listing: parsehtml.java
// (code-viewer UI label: "Font size:")
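/*
* File name:   ParseHtml.java
* Created:     2006-5-16 17:22:55
* Author:      shiwei
* Description: IParseFile implementation for HTML pages (fetches a page,
*              rewrites its URLs, hands the result to CreateHTMLFile and
*              recurses into the in-range links).
* Version:
*/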
package com.snoics.reptile.parse;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import com.snoics.base.interfaces.log.Log;
import com.snoics.base.util.StringClass;
import com.snoics.reptile.file.CreateHTMLFile;
import com.snoics.reptile.file.ICreateFile;
import com.snoics.reptile.link.createUrl.BuildUrl;
import com.snoics.reptile.link.createUrl.IBuildUrl;
import com.snoics.reptile.regex.url.IFilterAllUrl;
import com.snoics.reptile.regex.url.IMakeUpUrl;
import com.snoics.reptile.regex.url.impl.FilterAllUrl;
import com.snoics.reptile.regex.url.impl.MakeUpUrl;
import com.snoics.reptile.system.common.Common;
import com.snoics.reptile.system.common.CommonObject;
import com.snoics.reptile.util.UrlUtil;
import com.snoics.useclass.SnoicsClass;
/**
 * {@link IParseFile} implementation for HTML pages.
 *
 * For each URL it is given, the class resolves the URL against its parent
 * page, fetches the HTML through {@link UrlUtil}, groups the URLs found in
 * the page with an {@link IMakeUpUrl}, rewrites them through an
 * {@link IReplaceAllUrl}, passes the rewritten HTML to an {@link ICreateFile}
 * and finally recurses into the URLs reported by
 * {@link IMakeUpUrl#getRangeUrl()}.
 */
public class ParseHtml implements IParseFile{
    private CommonObject commonObject=new CommonObject();
    private UrlUtil urlUtil=new UrlUtil();
    private IReplaceAllUrl replaceAllUrl=new ReplaceAllUrl();
    private IBuildUrl buildUrl=new BuildUrl();
    private Log log=null;

    /**
     * Creates the parser and obtains the project log object.
     */
    public ParseHtml(){
        log=new SnoicsClass().getLog();
        // The result of getLogger is not kept; presumably the call binds the
        // log object to this class - TODO confirm against the Log interface.
        log.getLogger(getClass());
    }

    /**
     * Parses a single page: fetches it, rewrites its URLs, hands it to a
     * {@link CreateHTMLFile} and reports the URLs to follow next.
     *
     * @param url URL of the page, relative to {@code parentUrl}
     * @param parentUrl URL of the parent page; when {@code null}, {@code url}
     *        is used as-is as the address to fetch
     * @return the parent URL to use for the page's children together with the
     *         list from {@link IMakeUpUrl#getRangeUrl()}, or {@code null}
     *         when {@code url} is {@code null}, when the page resolves to its
     *         own parent, or when the relativized URL is already contained
     *         in {@code Common.createHtmlFileList}
     */
    public ParseHtmlInfo parse(String url,String parentUrl) {
        if(url==null){
            return null;
        }

        ICreateFile pageWriter=new CreateHTMLFile();

        String absoluteUrl;
        if(parentUrl==null){
            absoluteUrl=url;
        }else{
            // The parent is first resolved against the configured web site node.
            parentUrl=buildUrl.buildResolvedUrl(parentUrl,commonObject.getConfigInfo(Common.CONFIGFILE_NODE_WEBSITE));
            absoluteUrl=buildUrl.buildResolvedUrl(url,parentUrl);
            // A link back to the parent page itself is not parsed again.
            if(pointsToSamePage(absoluteUrl,parentUrl)){
                return null;
            }
        }

        String relativeUrl=buildUrl.buildRelativizeUrl(url,parentUrl);
        if(Common.createHtmlFileList.contains(relativeUrl)){
            return null;
        }

        String localFile=buildUrl.buildLocalHtmlResolvedFileName(relativeUrl,parentUrl);
        String childParentUrl=buildUrl.buildResolvedUrl(relativeUrl,parentUrl);

        IMakeUpUrl groupedUrls=getMakeUpUrl(absoluteUrl);
        writePage(pageWriter,replaceAllHtmlString(groupedUrls),absoluteUrl,relativeUrl,localFile);

        ParseHtmlInfo result=new ParseHtmlInfo();
        result.setParentUrl(childParentUrl);
        result.setUrlList(groupedUrls.getRangeUrl());
        return result;
    }

    /**
     * Compares two URLs after passing each through
     * {@code StringClass.getPreString(..., "#")} (presumably the text before
     * the "#" fragment - TODO confirm), ignoring case.
     *
     * @param firstUrl first URL
     * @param secondUrl second URL
     * @return {@code true} when both processed strings are equal ignoring case
     */
    private boolean pointsToSamePage(String firstUrl,String secondUrl){
        String firstWithoutAnchor=StringClass.getPreString(firstUrl,"#");
        String secondWithoutAnchor=StringClass.getPreString(secondUrl,"#");
        return firstWithoutAnchor.equalsIgnoreCase(secondWithoutAnchor);
    }

    /**
     * Configures the given file creator and triggers {@link ICreateFile#create()}.
     *
     * @param pageWriter file creator to configure
     * @param html rewritten HTML of the page
     * @param absoluteUrl resolved URL of the page
     * @param relativeUrl relativized URL of the page
     * @param localFile resolved local file name for the page
     */
    private void writePage(ICreateFile pageWriter,String html,String absoluteUrl,String relativeUrl,String localFile){
        pageWriter.setHtmlString(html);
        pageWriter.setResolvedUrl(absoluteUrl);
        pageWriter.setRelativizeUrl(relativeUrl);
        pageWriter.setResolvedFile(localFile);
        pageWriter.create();
    }

    /**
     * Parses all URLs (the pages below the url configured in
     * reptile-config.xml), recursing into every page that was parsed.
     *
     * @param startUrl relative URL(s); the string is split on
     *        {@code Common.STRING_SEPARATE_FLAG}, {@code null} is treated as ""
     * @param parentUrl URL of the parent page
     */
    public void parseAll(String startUrl,String parentUrl) {
        String urlsText=StringClass.getString(startUrl,"");
        ArrayList pendingUrls=StringClass.getInterString(Common.STRING_SEPARATE_FLAG,urlsText);
        if(pendingUrls==null){
            return;
        }

        // The size is read once up front, as in the original loop.
        int pendingCount=pendingUrls.size();
        for(int index=0;index<pendingCount;index++){
            ParseHtmlInfo pageInfo=parse((String)pendingUrls.get(index),parentUrl);
            if(pageInfo!=null){
                parseChildren(pageInfo);
            }
        }
    }

    /**
     * Recurses into every URL listed in the given parse result, using the
     * result's parent URL as the parent of each child.
     *
     * @param pageInfo result of a successful {@link #parse(String, String)}
     */
    private void parseChildren(ParseHtmlInfo pageInfo){
        String childParentUrl=pageInfo.getParentUrl();
        List childUrls=pageInfo.getUrlList();
        if(childUrls==null||childUrls.isEmpty()){
            return;
        }

        int childCount=childUrls.size();
        for(int index=0;index<childCount;index++){
            parseAll((String)childUrls.get(index),childParentUrl);
        }
    }

    /**
     * Fetches the page at {@code url} and groups the URLs found in it.
     *
     * @param url URL of the page to fetch; also set as the parent URL of the
     *        returned object
     * @return an {@link IMakeUpUrl} on which {@code makeUp()} has been called
     */
    private IMakeUpUrl getMakeUpUrl(String url){
        IFilterAllUrl urlFilter=new FilterAllUrl();
        IMakeUpUrl grouper=new MakeUpUrl();

        // Build the HTML string of the current page from its URL.
        String pageHtml=urlUtil.getHtmlString(url);

        // Collect all URLs contained in the current page.
        urlFilter.setHtmlString(pageHtml);
        Set foundUrls=urlFilter.getHtmlUrl();

        // Group the collected URLs.
        grouper.setParentUrl(url);
        grouper.setHtmlString(pageHtml);
        grouper.setAllHtmlUrl(foundUrls);
        grouper.setUrlRegexList(urlFilter.getUrlRegexList());
        grouper.makeUp();
        return grouper;
    }

    /**
     * Runs the four URL replacement passes over the page's HTML.
     *
     * @param makeUpUrl grouped URLs of the page; also supplies the HTML
     * @return the HTML string after all four replacement passes
     */
    private String replaceAllHtmlString(IMakeUpUrl makeUpUrl){
        String pageHtml=makeUpUrl.getHtmlString();
        // URLs that are forbidden from being crawled.
        String afterForbidden=replaceAllUrl.replaceForbidUrl(makeUpUrl,pageHtml);
        // URLs that are not to be downloaded.
        String afterUnDownload=replaceAllUrl.replaceUnDownloadUrl(makeUpUrl,afterForbidden);
        // URLs inside the crawl range.
        String afterRange=replaceAllUrl.replaceRangeUrl(makeUpUrl,afterUnDownload);
        // URLs inside the download range.
        return replaceAllUrl.replaceDownloadUrl(makeUpUrl,afterRange);
    }
}
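/*
 * Code-viewer keyboard shortcuts (page residue, not part of ParseHtml.java):
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Toggle theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */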