📄 spider.java
字号:
try{
URL urlHost = new URL (url.getProtocol(),url.getHost(),"/");
if(true){ //check robots.txt under host root only //url.toString().equalsIgnoreCase(urlHost.toString())){
try{
URL hostRobots = new URL (url.getProtocol(),url.getHost(),"/robots.txt");
log("Checking : "+hostRobots.toString());
//open robots.txt
URLConnection connection = hostRobots.openConnection();
InputStream is = connection.getInputStream();
Reader r = new InputStreamReader(is);
BufferedReader stdio = new BufferedReader (r);
String robotsLine;
String desc;
String dir;
URL forbidenUrl;
while((robotsLine = stdio.readLine()) !=null){ //read until the file ends
if(robotsLine.indexOf(":")!=-1 ){
desc = (robotsLine.substring(robotsLine.indexOf(":")+1,robotsLine.length())).trim();
if( desc.equals("*") || desc.equals("Baiduspider"))
{
if((robotsLine = stdio.readLine()) !=null && robotsLine.indexOf(":")<robotsLine.length()-1){
dir = (robotsLine.substring(robotsLine.indexOf(":")+3,robotsLine.length())).trim();
forbidenUrl= new URL(urlHost.toString()+dir);
if(!getWorkloadForbiden().contains(forbidenUrl)){
getWorkloadForbiden().add(forbidenUrl);
}
log("Add to workloadForbiden: "+urlHost.toString()+dir);
}
}
}
}
}catch (IOException e){
;
}
}
}catch (MalformedURLException ex) {
}
}
/**
 * Decides whether {@code url} may be crawled, based on the URLs stored in
 * the forbidden workload ({@code getWorkloadForbiden()}).
 *
 * <p>A URL is treated as forbidden when either
 * <ul>
 *   <li>its directory (everything up to and including the last '/') is
 *       contained in the forbidden workload, or</li>
 *   <li>its string form starts with the string form of any forbidden entry,
 *       so that a forbidden directory also covers everything beneath it
 *       (robots.txt rules are path prefixes).</li>
 * </ul>
 *
 * <p>Despite the method name, the return value is {@code true} when the URL
 * is ALLOWED.
 *
 * @param url the URL about to be crawled
 * @return {@code true} if the URL may be crawled; {@code false} if it is
 *         forbidden or if its directory URL cannot be constructed
 */
public boolean checkIfThisUrlForbiden(URL url) //@Author Kelven.JU
{
    String urlString = url.toString();
    // Directory part of the URL: everything up to and including the last '/'.
    String urlDirString = urlString.substring(0, urlString.lastIndexOf("/") + 1);
    try
    {
        URL urlDir = new URL(urlDirString);
        log(urlDir.toString());

        // Original exact-directory lookup, kept so nothing that used to be
        // rejected is now accepted.
        boolean forbidden = getWorkloadForbiden().contains(urlDir);

        // Prefix match: a forbidden entry also forbids all deeper paths.
        if (!forbidden)
        {
            Object[] rules = getWorkloadForbiden().toArray();
            for (int i = 0; i < rules.length && !forbidden; i++)
            {
                forbidden = rules[i] != null
                    && urlString.startsWith(rules[i].toString());
            }
        }

        if (forbidden)
        {
            log(url.toString() + " IS NOT allowed to be Crawled");
            return false;
        }
        log(url.toString() + " IS allowed to be Crawled");
        return true;
    }
    catch (MalformedURLException ex)
    {
        // The directory string could not be turned into a URL (for example
        // when the URL contains no '/'); be conservative and refuse it.
        System.out.println("Error accourd at function checkIfThisUrlForbiden");
        return false;
    }
}
/**
 * Called internally to process a URL.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Derive a file type from the text after the last '.' (when that dot
 *       follows the last '/'); if {@code workloadFileType} contains it, the
 *       URL is fetched but not parsed.</li>
 *   <li>Run {@code checkHostRobots(url)} and then
 *       {@code checkIfThisUrlForbiden(url)}; a URL that is not allowed is
 *       moved from the waiting workload to the processed workload and is
 *       NOT fetched.</li>
 *   <li>Open the connection. Parseable URLs whose content type is known and
 *       does not start with {@code text/} are marked processed and skipped.
 *       Non-parseable URLs are marked processed when
 *       {@code getContent()} is non-null, otherwise marked as errors.</li>
 *   <li>Parse the page with {@link Parser}, report and write the page
 *       score, then move the URL to the processed workload.</li>
 * </ol>
 *
 * <p>Any {@link IOException} moves the URL to the error workload and is
 * reported through {@code report.spiderURLError(url)}.
 *
 * @param url The URL to be processed.
 */
public void processURL(URL url)
{
    boolean parseableUrl = true;
    Reader r = null; // closed in the finally block below
    try {
        log("Processing: " + url );
        // fetch the file type
        String urlString = url.toString();
        String currentFileType ="";
        if(urlString.lastIndexOf(".")>urlString.lastIndexOf("/")){
            currentFileType = urlString.substring(urlString.lastIndexOf(".")+1,urlString.length());
            if(workloadFileType.contains(currentFileType))
                parseableUrl = false;
        }

        // robots.txt handling: checkIfThisUrlForbiden returns true when the
        // URL is ALLOWED, so a false result means "do not crawl".
        checkHostRobots(url);
        if(!checkIfThisUrlForbiden(url))
        {
            // Take the URL out of the waiting workload so begin() does not
            // pick it up again, and stop before any network access.
            getWorkloadWaiting().remove(url);
            getWorkloadProcessed().add(url);
            log("Skipped (not allowed to be crawled): " + url );
            return;
        }

        //open URL
        URLConnection connection = url.openConnection();
        if(parseableUrl){
            // skip when the content type is known and is not "text/..."
            // (an unknown/null content type is still parsed)
            if ( (connection.getContentType()!=null) && !connection.getContentType().toLowerCase().startsWith("text/") ) {
                getWorkloadWaiting().remove(url);
                getWorkloadProcessed().add(url);
                log("Not processing because content type is: " +
                    connection.getContentType() );
                return;
            }
        }
        else{
            if(connection.getContent()!=null){
                log("Not parse but complete (file type: "+currentFileType+"):"+connection.getURL().toString());
                getWorkloadWaiting().remove(url);
                getWorkloadProcessed().add(url);
                return;
            }
            else{
                getWorkloadWaiting().remove(url);
                getWorkloadError().add(url);
                log("Error:"+connection.getURL().toString());
                report.spiderURLError(url);
                return;
            }
        }

        //download web page
        InputStream is = connection.getInputStream();
        r = new InputStreamReader(is);

        // parse the URL; Parser.handleText accumulates into scoreOfPage
        scoreOfPage=0.0;
        HTMLEditorKit.Parser parse = new HTMLParse().getParser();
        parse.parse(r,new Parser(url),true);

        // report the keyword score collected while parsing
        log("Page Score ["+url.toString()+"]: "+scoreOfPage);
        report.spiderOutputPageScore(url, scoreOfPage);
        resultOut.newLine();
        resultOut.write(new Date()+"[完成]"+"{得分"+scoreOfPage+"} ->"+url);
        resultOut.flush();
    } catch ( IOException e ) { //if any error during processing the URL
        getWorkloadWaiting().remove(url);
        getWorkloadError().add(url);
        log("Error: " + url );
        report.spiderURLError(url);
        return;
    } finally {
        // Release the page stream on every exit path.
        if ( r!=null ) {
            try {
                r.close();
            } catch ( IOException ignored ) {
                // best effort: the page has already been handled
            }
        }
    }
    //URL processing completes, log event
    getWorkloadWaiting().remove(url);
    getWorkloadProcessed().add(url);
    log("Complete: " + url );
}
/**
 * Starts the spider.
 *
 * <p>Clears the {@code cancel} flag, logs which of the robots.txt / meta-tag
 * checks are switched on, and then keeps processing snapshots of the waiting
 * workload until it is empty or {@code cancel} becomes true.
 */
public void begin()
{
    cancel = false;

    // Pick the start-up message from the two option flags.
    final String startupMessage;
    if (checkRobotsOption) {
        startupMessage = checkMetaTagOption
            ? "Begin WITH checking robots.txt and meta tags!"
            : "Begin WITH checking robots.txt but WITHOUT mata tags!";
    } else {
        startupMessage = checkMetaTagOption
            ? "Begin WITH checking Meta Tags but WITHOUT robots.txt!"
            : "Begin WITHOUT checking robots.txt and meta tags!";
    }
    log(startupMessage);

    while (!getWorkloadWaiting().isEmpty() && !cancel) {
        // Work on a copy: processURL modifies the waiting workload.
        Object[] snapshot = getWorkloadWaiting().toArray();
        for (Object pending : snapshot) {
            if (cancel) {
                break;
            }
            processURL((URL) pending);
        }
    }
}
/*** Connection Timer***/
/**
 * HTML parser callback used while crawling a page (author: Kelven.JU).
 *
 * <p>For each page it
 * <ul>
 *   <li>adds keyword weights to the enclosing spider's {@code scoreOfPage}
 *       for every keyword occurrence found in text nodes,</li>
 *   <li>honours {@code <meta name="robots" content="...">} directives
 *       ({@code noindex}, {@code nofollow}, {@code none}), and</li>
 *   <li>reports links and e-mail addresses found in {@code href} attributes
 *       (or {@code src} for frames).</li>
 * </ul>
 */
protected class Parser
    extends HTMLEditorKit.ParserCallback {
    /** URL of the page being parsed; relative links are resolved against it. */
    protected URL base;
    // "friendly crawler" state taken from the robots meta tag
    protected boolean indexEnable;
    protected boolean followEnable;
    //file to output the textual content
    protected BufferedWriter fileOut ;
    protected File tmpTextFilePath;

    /**
     * Creates a callback for one page and opens a per-page file under
     * {@code tmp/} whose name is the URL with ':', '/' and '.' removed.
     *
     * @param base URL of the page being parsed
     */
    public Parser(URL base)
    {
        this.base = base;
        indexEnable = true;
        followEnable = true;
        try{
            tmpTextFilePath=new File("tmp");
            tmpTextFilePath.mkdir();
            // NOTE(review): nothing in this class writes to or closes this
            // writer; confirm whether the tmp file is still needed.
            fileOut=new BufferedWriter ( new FileWriter (new File("tmp/"+base.toString().replace(":","").replace("/","").replace(".","")+".tmp")));
        }catch(IOException e){
            System.out.println("Construtor Parser [Spider.java 623] IO error!");
        }
    }

    /**
     * Scores a text node: for every entry of {@code wordList}, each
     * occurrence in the text (overlapping ones included) adds the entry's
     * weight from {@code wordWeight} to {@code scoreOfPage}.
     *
     * @param text     characters of the text node
     * @param position position reported by the parser (unused)
     */
    @Override
    public void handleText(char[] text, int position){
        String tmpString=new String(text);
        for(int i=0;i<wordList.size();i++){
            String word=(String)wordList.get(i);
            // indexOf("") matches at every position and never returns -1,
            // which would spin forever; an empty keyword scores nothing.
            if(word.length()==0){
                continue;
            }
            int tmpIndex=-1;
            while((tmpIndex=tmpString.indexOf(word,tmpIndex+1))!=-1){
                scoreOfPage=scoreOfPage+(Double)wordWeight.get(i);
            }
        }
    }

    /**
     * Handles a tag: applies robots meta directives, then looks for a link
     * in {@code href} (or {@code src} for frames). Fragments after '#' are
     * dropped, {@code mailto:} links are reported as e-mail addresses, and
     * other links are followed only while {@code followEnable} is true.
     *
     * @param t   the tag
     * @param a   the tag's attributes
     * @param pos position reported by the parser (unused)
     */
    @Override
    public void handleSimpleTag(HTML.Tag t,
        MutableAttributeSet a,int pos)
    {
        if(t == HTML.Tag.META){ //check META tag
            String metaName = (String)a.getAttribute(HTML.Attribute.NAME );
            String metaContent = (String)a.getAttribute(HTML.Attribute.CONTENT );
            if(metaName != null && metaContent != null
                && metaName.equalsIgnoreCase("robots")){
                applyRobotsDirectives(metaContent);
            }
        }
        String href = (String)a.getAttribute(HTML.Attribute.HREF);
        //handle frame
        if( (href==null) && (t==HTML.Tag.FRAME) )
            href = (String)a.getAttribute(HTML.Attribute.SRC);
        if ( href==null )
            return;
        //drop the fragment part ("#...")
        int i = href.indexOf('#');
        if ( i!=-1 )
            href = href.substring(0,i);
        //handle email address (case-insensitive, locale-independent)
        if ( href.regionMatches(true,0,"mailto:",0,7) ) {
            report.spiderFoundEMail(href);
            return;
        }
        if( followEnable ) {
            handleLink(base,href);
        }
        else
        {
            log("Meta tag shows following is NOT allowed:"+base.toString());
        }
    }

    /** Start tags are treated exactly like simple tags. */
    @Override
    public void handleStartTag(HTML.Tag t,
        MutableAttributeSet a,int pos)
    {
        handleSimpleTag(t,a,pos);
    }

    /**
     * Applies the content of a robots meta tag. The content is a
     * comma-separated list; every directive is trimmed, compared
     * case-insensitively and applied independently, so
     * {@code "noindex, nofollow"} clears both flags.
     *
     * @param content value of the meta tag's {@code content} attribute
     */
    private void applyRobotsDirectives(String content)
    {
        String[] directives = content.split(",");
        for(int k=0;k<directives.length;k++){
            String directive = directives[k].trim();
            if(directive.equalsIgnoreCase("noindex")){
                indexEnable = false;
            }
            else if(directive.equalsIgnoreCase("nofollow")){
                followEnable = false;
            }
            else if(directive.equalsIgnoreCase("none")){
                indexEnable = false;
                followEnable = false;
            }
        }
    }

    /**
     * Turns a link string into a URL and offers it to the spider.
     * Strings starting with {@code http://} are used as-is, strings starting
     * with {@code www.} get {@code http://} prepended, everything else is
     * resolved against {@code base}. The URL is added to the workload only
     * when {@code report.spiderFoundURL(base,url)} returns true.
     *
     * @param base URL of the page containing the link
     * @param str  the link text taken from the page
     */
    protected void handleLink(URL base,String str)
    {
        try {
            URL url;
            if(str.startsWith("http://") ){
                url = new URL(str);
            }
            else if(str.startsWith("www.")){
                url = new URL("http://"+str);
            }
            else{
                url = new URL(base,str);
            }
            if ( report.spiderFoundURL(base,url) )
                addURL(url);
        } catch ( MalformedURLException e ){
            log("Found malformed URL: " + str );
        }
    }
}
/*** Connection Timer***/
/**
 * HTML parser callback that only dumps a page's text (author: Kelven.JU).
 *
 * <p>Every text node is appended, one per line, to a file under
 * {@code train/} whose name is the page URL with ':', '/' and '.' removed
 * plus the {@code .train} suffix. Tags and links are ignored.
 */
protected class Parser2
    extends HTMLEditorKit.ParserCallback {
    /** URL of the page being parsed. */
    protected URL base;
    private File tmpFilePath;
    //file to output the textual content; stays null if it could not be opened
    // NOTE(review): this writer is flushed after every write but never
    // closed inside this class; confirm who owns its lifetime.
    protected BufferedWriter fileOut2 ;

    /**
     * Creates the {@code train} directory if needed and opens the output
     * file for {@code base}. An I/O failure is reported on standard output
     * and leaves {@code fileOut2} unset.
     *
     * @param base URL of the page being parsed
     */
    public Parser2(URL base)
    {
        this.base = base;
        try{
            tmpFilePath=new File("train");
            tmpFilePath.mkdir();
            fileOut2=new BufferedWriter ( new FileWriter ( "train/"+base.toString().replace(":","").replace("/","").replace(".","")+".train" ));
        }catch(IOException e){
            System.out.println("Construtor Parser2 [Spider.java 715] IO error!");
        }
    }

    /**
     * Writes one text node followed by a line separator and flushes.
     *
     * @param text     characters of the text node
     * @param position position reported by the parser (unused)
     */
    @Override
    public void handleText(char[] text, int position){
        // The constructor already reported the failure; without this guard
        // every text node would raise a NullPointerException.
        if(fileOut2==null){
            return;
        }
        try{
            fileOut2.write(text);
            fileOut2.newLine();
            fileOut2.flush();
        }catch (IOException e)
        {
            System.out.println("Function handleText [Spider.java 586] IO error!");
        }
    }

    /** Tags are intentionally ignored. */
    @Override
    public void handleSimpleTag(HTML.Tag t,
        MutableAttributeSet a,int pos)
    {
    }

    /** Tags are intentionally ignored. */
    @Override
    public void handleStartTag(HTML.Tag t,
        MutableAttributeSet a,int pos)
    {
    }

    /** Links are intentionally ignored. */
    protected void handleLink(URL base,String str)
    {
    }
}
/**
 * Logs an event.
 *
 * <p>The entry is prefixed with the current date and a colon, printed to
 * standard output, and then appended to {@code fileOut} on a new line
 * (flushed immediately). An I/O failure while writing the file is reported
 * on standard output and otherwise ignored.
 *
 * @param entry the text to log
 */
public void log(String entry)
{
    final String stamped = new Date() + ":" + entry;
    System.out.println(stamped);
    try {
        // The line separator goes before the message, not after it.
        fileOut.newLine();
        fileOut.write(stamped);
        fileOut.flush();
    } catch (IOException ioFailure) {
        System.out.println("Function log [Spider.java 507] IO error!");
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -