📄 pageoperate.java

📁 本程序可从网上利用百度搜索引擎下载和输入关键词有关的网页
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页



package spider;

import spider.SaveToDataBase;
import spider.photoOperate;

import java.net.*;
import java.io.*;
import java.util.*;

public class PageOperate
{
	private String myUrl;// URL to fetch
	private String myHtml;// HTML text obtained from the URL
	private String myAddress;// location (directory) where the HTML file is saved
	private int myFileNumber;// number of saved files; appended to title-derived file names
	private String myFileName;// name of the file the HTML text is saved under
	private List myHttpList = new ArrayList();// holds URLs extracted from the HTML text

	/**
	 * Sets the URL this object will connect to.
	 *
	 * @param myUrl the address to store; stored as given, without validation
	 */
	public void setMyUrl(String myUrl) {
		this.myUrl = myUrl;
	}

	/**
	 * Builds the URL from a search prefix and a keyword.
	 *
	 * The keyword is URL-encoded and appended to the prefix.
	 * NOTE(review): the one-argument {@code URLEncoder.encode} is deprecated and
	 * uses the platform default encoding; kept here so the produced URL is unchanged.
	 *
	 * @param baidu      the search URL prefix the encoded keyword is appended to
	 * @param searchWord the keyword to search for
	 */
	public void setMyUrl(String baidu, String searchWord) {
		String encodedWord = java.net.URLEncoder.encode(searchWord);
		this.myUrl = baidu + encodedWord;
	}
	
	/**
	 * Returns the URL currently stored in this object.
	 *
	 * @return the stored URL, or {@code null} if none has been set
	 */
	public String getMyUrl() {
		return this.myUrl;
	}

	/**
	 * Downloads the page at {@code myUrl} and stores its text in {@code myHtml}.
	 *
	 * Every line read is terminated with "\r\n". After the text is stored,
	 * {@code charSet()} is invoked (the original comment describes it as an
	 * encoding conversion; it is defined outside this method). On any failure
	 * {@code myHtml} is set to {@code null} and the error is printed. If
	 * {@code myUrl} is {@code null} nothing is fetched and {@code myHtml} is
	 * left untouched.
	 */
	public void setHtml()
	{
		if (myUrl == null)
		{
			System.out.println("myUrl为空");
			return;
		}

		try
		{
			URL ul = new URL(myUrl);
			StringBuffer sb = new StringBuffer("");
			BufferedReader br = new BufferedReader(new InputStreamReader(ul.openStream()));
			try
			{
				System.out.println("连接完成");
				String line;
				while ((line = br.readLine()) != null)
				{
					sb.append(line).append("\r\n");
				}
			}
			finally
			{
				// Always release the connection, even when reading fails part-way.
				br.close();
			}
			myHtml = sb.toString();
			this.charSet();
			System.out.println("读取完成");
		}
		catch (Exception e)
		{
			myHtml = null;
			System.out.println("error open url   " + myUrl + "  and HTML is null");
			e.printStackTrace();
		}
	}

	/**
	 * Returns the HTML text currently held by this object.
	 *
	 * @return the stored HTML text, or {@code null} if none is available
	 */
	public String getHtml() {
		return this.myHtml;
	}

	/**
	 * Sets the location where files are saved.
	 *
	 * @param oneAddress the save location to store
	 */
	public void setMyAddress(String oneAddress) {
		this.myAddress = oneAddress;
	}

	/**
	 * Returns the location where files are saved.
	 *
	 * @return the stored save location, or {@code null} if none has been set
	 */
	public String getMyAddress() {
		return this.myAddress;
	}

	/**
	 * Sets the file number.
	 *
	 * @param oneFileNumber the file number to store
	 */
	public void setMyFileNumber(int oneFileNumber) {
		this.myFileNumber = oneFileNumber;
	}

	/**
	 * Returns the file number.
	 *
	 * @return the stored file number
	 */
	public int getMyFileNumber() {
		return this.myFileNumber;
	}

	/**
	 * Sets the file name explicitly.
	 *
	 * @param myFileName the file name to store, used as given
	 */
	public void setFileName(String myFileName) {
		this.myFileName = myFileName;
	}
	/**
	 * Derives {@code myFileName} from the current page and appends an extension.
	 *
	 * Steps:
	 * 1. If {@code myHtml} is available, the text between the opening
	 *    {@code <title ...>} tag and {@code </title>} is taken, characters that
	 *    are awkward in file names are replaced, and {@code myFileNumber} is
	 *    appended.
	 * 2. If no name is set after step 1, the name is built from {@code myUrl}
	 *    with its punctuation replaced by underscores.
	 * 3. An extension is appended: ".asp", ".jsp" or ".htm" when the URL
	 *    contains that text (checked in that order), otherwise ".html".
	 *
	 * When the title cannot be located, {@code myFileName} keeps its previous
	 * value, as before. If neither a name nor a URL is available the method
	 * returns with {@code myFileName} still {@code null} instead of failing.
	 */
	public void setFileName()
	{
		if (myHtml != null)
		{
			// Find the end of the opening tag explicitly so that "<title>" and
			// "<title attr=...>" are both handled, and a missing tag is detected.
			int tagStart = myHtml.indexOf("<title");
			int tagEnd = (tagStart == -1) ? -1 : myHtml.indexOf('>', tagStart);
			int closeStart = (tagEnd == -1) ? -1 : myHtml.indexOf("</title>", tagEnd + 1);

			if (closeStart == -1)
			{
				// No usable title: keep whatever name was there and fall through.
				System.out.println("出现异常myFileName:" + myFileName);
			}
			else
			{
				String title = myHtml.substring(tagEnd + 1, closeStart);
				String replaceUrl = title.replace('.', '_').replace(':', '_')
						.replace('/', '_').replace('?', '_').replace('=', '_')
						.replace('|', '_').replace('&', '_').replace(' ', '_')
						.replace('，', '_').replace('《', '_').replace('》', '_')
						.replace('"', '_').replace('>', '_').replace('-', '_')
						.replace(',', ' ');
				myFileName = replaceUrl + myFileNumber;
				System.out.println("文件名myFileName:" + myFileName);
			}
		}
		else
		{
			System.out.println("myHtml是空的");
		}

		if (myFileName == null)
		{
			if (myUrl != null)
			{
				// The title was unusable, so build the name from the URL instead.
				System.out.println("error title  filename but use the www");
				myFileName = myUrl.replace('.', '_').replace(':', '_')
						.replace('/', '_').replace('?', '_').replace('=', '_')
						.replace('|', '_').replace('&', '_');
			}
			else
			{
				System.out.println("myUrl是空的");
				// Nothing to build a name from; avoid producing "null.html".
				return;
			}
		}

		// Pick the extension from the URL; a missing URL uses the default.
		// NOTE(review): a URL containing ".html" matches the ".htm" check first
		// and therefore gets ".htm"; kept as in the original.
		String extension = ".html";
		if (myUrl != null)
		{
			if (myUrl.indexOf(".asp") != -1)
			{
				extension = ".asp";
			}
			else if (myUrl.indexOf(".jsp") != -1)
			{
				extension = ".jsp";
			}
			else if (myUrl.indexOf(".htm") != -1)
			{
				extension = ".htm";
			}
		}

		if (".html".equals(extension))
		{
			// Only the default branch trims and echoes the name, as before.
			myFileName = myFileName.trim() + ".html";
			System.out.println(myFileName);
		}
		else
		{
			myFileName = myFileName + extension;
		}
	}

	/**
	 * Returns the current file name.
	 *
	 * @return the stored file name, or {@code null} if none has been set
	 */
	public String getFileName() {
		return this.myFileName;
	}

	/**
	 * Appends {@code myHtml} to the file {@code myFileName} inside the
	 * directory {@code myAddress}.
	 *
	 * The directory (including parents) and the file are created when they do
	 * not exist. The text is written line by line, each line ending with the
	 * platform line separator. Nothing is written when any of {@code myHtml},
	 * {@code myAddress} or {@code myFileName} is {@code null}. I/O errors are
	 * printed and not rethrown.
	 */
	public void saveHtmlToFile()
	{
		if (myHtml == null || myAddress == null || myFileName == null)
		{
			// Message kept as-is, although the address or file name may be the null one.
			System.out.println("myHtml is null");
			return;
		}

		try
		{
			File dir = new File(myAddress);
			if (!dir.exists())
			{
				dir.mkdirs();
			}
			File write = new File(dir, myFileName);
			if (!write.exists())
			{
				write.createNewFile();
			}

			BufferedWriter bw = new BufferedWriter(new FileWriter(write, true));
			try
			{
				String[] someHm = myHtml.split("\n");
				for (int i = 0; i < someHm.length; i++)
				{
					String line = someHm[i];
					// Lines built with "\r\n" still carry the "\r" after splitting on
					// "\n"; drop it so newLine() does not double the terminator.
					if (line.endsWith("\r"))
					{
						line = line.substring(0, line.length() - 1);
					}
					bw.write(line);
					bw.newLine();
				}
			}
			finally
			{
				// Close the writer even when a write fails, so the file handle is released.
				bw.close();
			}
			System.out.println("保存完毕");
		}
		catch (Exception e)
		{
			e.printStackTrace();
			System.out.println("出现异常 error save html to file");
		}
	}

	/**
	 * Returns the part of {@code spiderHtml} from {@code beginIndex}
	 * (inclusive) to {@code endIndex} (exclusive).
	 *
	 * If the substring cannot be taken (indices out of range, or
	 * {@code spiderHtml} is {@code null}), a message containing the input is
	 * printed and a single space is returned.
	 *
	 * @param beginIndex start index, inclusive
	 * @param endIndex   end index, exclusive
	 * @param spiderHtml the text to cut from
	 * @return the requested substring, or " " on failure
	 */
	public String http(int beginIndex, int endIndex, String spiderHtml) {
		try {
			return spiderHtml.substring(beginIndex, endIndex);
		} catch (Exception e) {
			System.out.println("此处有异常" + spiderHtml);
			return " ";
		}
	}


	
	//首先处理切分后没有http://的情况（首先判断第一个字符是否是单双引号，接着派段下面是否是http://）。再同统一处理有http://的情况
	public void setHttpList(String splitWord)// 从httl页面中提取url存放到LIst中，用特定的词splitWord分割网页
	{
		if (myHtml != null) 
		{							
			String[] splitHtmlUsehref = myHtml.split(splitWord); // href=切割// 用到了上面定义的myHtml	

			System.out.println(splitHtmlUsehref.length);
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -