📄 crawler.java
字号:
import java.io.*;
import java.util.*;
/*
* IKT-502 Information Technology Seminar
*
* Learning Automata based crawler resource allocation
*
* Explanation:
*
* Folders: /new - files get copied here when they're crawled
* /old - contains files for checking with the isUpdated() function
*
* If you use HarvestMan (the default crawler), change its installation path in the crawl() function
*
*
*/
/**
 * Crawl scheduler that decides per page, on every round, whether to crawl it,
 * and afterwards calls reward() or punish() on the crawled pages.
 *
 * The list of pages is read from "pages.txt" (one constructor argument for
 * {@code page} per line). Crawling itself is delegated to external commands
 * started through the {@code runtime} helper.
 */
public class crawler {
    /** All pages known to the crawler, in the order they were read from pages.txt. */
    public ArrayList<page> page;
    /** Pages crawled in the current round for which isUpdated() returned true. */
    public ArrayList<page> rewardPage;
    /** Pages crawled in the current round for which isUpdated() returned false. */
    public ArrayList<page> punishPage;
    /** Number of crawls per round above which the round counts as over capacity. */
    public int capacity;
    /** Number of pages crawled in the current round. */
    public int capacityUsed;
    /** Helper used to run command lines. */
    public runtime runtime;

    /** Single random source shared by all rounds instead of one new instance per page. */
    private final Random random = new Random();

    /**
     * Creates the working folders "new" and "old", sets the capacity and loads
     * the page list from "pages.txt" in the current working directory.
     *
     * Failures while reading the file are printed to stderr and leave the page
     * list with whatever entries were read before the failure.
     */
    public crawler() {
        page = new ArrayList<page>();
        rewardPage = new ArrayList<page>();
        punishPage = new ArrayList<page>();
        // for running command line
        runtime = new runtime();
        runtime.run("cmd /C mkdir new");
        runtime.run("cmd /C mkdir old");
        capacityUsed = 0;
        // Set webcrawling capacity
        capacity = 2;
        // Read list of pages to crawl from file
        loadPages(new File("pages.txt"));
    }

    /**
     * Appends one {@code page} per line of the given file to the page list.
     *
     * @param file text file with one page entry per line
     */
    private void loadPages(File file) {
        // try-with-resources guarantees the reader is closed, even on failure.
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            // readLine() returning null is the end-of-file signal; ready() is not.
            while ((line = br.readLine()) != null) {
                page.add(new page(line));
            }
        } catch (Exception e) {
            // Kept broad on purpose: the page constructor is declared elsewhere
            // and may fail with something other than an IOException.
            e.printStackTrace();
        }
    }

    /**
     * Runs one crawl round.
     *
     * Each page is crawled when a uniform random draw in [0, 1) is below the
     * value returned by its getState(). Crawled pages are sorted into
     * rewardPage or punishPage according to isUpdated(). If more pages were
     * crawled than {@code capacity} allows, every page in punishPage is
     * punished; otherwise every page in rewardPage is rewarded. The outcome of
     * each reward/punish call and the capacity usage are printed to stdout.
     */
    public void crawl() {
        // Clear pages and capacity used
        rewardPage.clear();
        punishPage.clear();
        capacityUsed = 0;

        for (page candidate : page) {
            if (random.nextFloat() < candidate.getState()) {
                fetch(candidate);
                if (candidate.isUpdated()) {
                    rewardPage.add(candidate);
                } else {
                    punishPage.add(candidate);
                }
                capacityUsed++;
            }
        }

        if (capacityUsed > capacity) {
            // Over capacity: only the crawls that found nothing new are punished.
            for (page punished : punishPage) {
                punished.punish();
                System.out.println("Punish - " + punished.getDir() + " - (" + punished.getState() + ")");
            }
        } else {
            // Within capacity: only the crawls that found an update are rewarded.
            for (page rewarded : rewardPage) {
                rewarded.reward();
                System.out.println("Reward - " + rewarded.getDir() + " - (" + rewarded.getState() + ")");
            }
        }
        System.out.println("Capacity used:" + capacityUsed + "/" + capacity);
    }

    /**
     * Downloads one page with the external crawler and moves the result into
     * the page's sub-folder of "new".
     *
     * Use your crawler of choice here. If you use harvestman: remember to
     * change the path.
     *
     * @param target page to download
     */
    private void fetch(page target) {
        runtime.run("cmd /C C:\\HarvestMan1\\harvestman.exe -N " + target.getUrl());
        runtime.run("cmd /C mkdir new\\" + target.getDir());
        runtime.run("cmd /C move " + target.getFileName() + " new\\" + target.getDir());
    }

    /**
     * Runs three crawl rounds.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        crawler Crawler = new crawler();
        for (int i = 0; i < 3; i++) {
            Crawler.crawl();
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -