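/*
 * Listing-page header captured together with the source (not part of the program):
 *   Welcome to the Chongchong download site! | Downloads | Collections | About us
 *   Chongchong download site
 *
 *   File: getsoupu028shop.java
 *   Description: a program that collects listing information from the Chengdu shop website
 *   Language: JAVA
 *   Font size:
 */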


import java.io.*;
import java.net.*;
import java.sql.*;
import java.util.*;


/**
 * Scraper for shop listings published on soupu028.com.
 *
 * <p>Listing URLs are read from the {@code soupu028Url} table, each page is
 * downloaded and parsed with fixed string markers, the extracted fields are
 * stored through the {@code add_soupu028} stored procedure, and the URL is then
 * flagged through {@code update_soupu028Gets}.
 *
 * <p>Search page the URLs were originally taken from:
 * http://www.soupu028.com/search.asp?PageNo=1&xxlb=商铺门面&qy=&jzmj=&zjjg=&xxbt=&bh=####
 *
 * @author Administrator
 */
public class GetSoupu028Shop {
    // Legacy package-visible JDBC handles, kept so that code referring to them
    // still compiles. Only "con" is still assigned (by getConnection()); every
    // method below works on method-local handles and closes them when done.
    Connection con = null;
    Statement st = null;
    CallableStatement cs = null;
    ResultSet rs = null;

    /** Closing tag that terminates every scraped value. */
    private static final String FONT_CLOSE = "</font>";
    /** Marker that directly precedes listing values (title, area, region, ...). */
    private static final String VALUE_MARKER = "#0066FF\" >";
    /** Marker that directly precedes contact name and e-mail values. */
    private static final String CONTACT_MARKER = "#993300>";
    /** Marker that directly precedes phone values (note the blank before '>'). */
    private static final String PHONE_MARKER = "#993300 >";
    /** Marker opening the first details block (two blanks before '>'). */
    private static final String DETAILS_MARKER = "color=\"#0066FF\"  >&nbsp;";
    /** Marker opening the last details block (one blank before '>'). */
    private static final String DETAILS_TAIL_MARKER = "color=\"#0066FF\" >&nbsp;";

    /**
     * Classification rules, checked top to bottom: element 0 is the category,
     * the remaining elements are keywords that select it.
     */
    private static final String[][] TYPE_RULES = {
        {"酒吧水吧", "酒吧", "水吧"},
        // "快餐" has to be tested before the broader "餐"; in the original order
        // the "快餐" keyword could never be reached.
        {"快餐排档", "快餐"},
        {"中西餐馆", "餐"},
        {"服装饰品", "衣服", "饰品", "服装"},
        {"快餐排档", "面馆", "小吃"},
        // "茶楼" and "茶坊" both contain "茶", so one keyword is enough.
        {"茶楼茶坊", "茶"},
        {"干洗水洗", "干洗", "水洗"},
        {"火锅汤锅", "火锅", "汤锅"},
        {"美容美发", "美容", "美发"},
        {"音像书刊", "影", "书"},
        {"网吧游戏", "网吧", "游戏"},
        {"休闲娱乐", "桑拿", "农家乐", "浴足", "瑜伽", "娱乐"},
        {"写字间", "写字楼", "办公室"},
        {"商业合作", "地产代理", "中介"},
        {"厂房库房", "厂房", "仓库", "土地"},
    };

    /**
     * Program entry point. Currently does nothing.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

    }

    /**
     * Processes every URL returned by {@link #querySoupu028Url()}: parses and
     * stores the listing, then flags the URL through {@code update_soupu028Gets}.
     *
     * <p>A URL or page that cannot be parsed is reported and skipped without
     * being flagged, so it no longer aborts the whole run. A database error
     * still stops the run, as before.
     */
    public void saveSoupu028Shop() {
        try {
            Vector urls = querySoupu028Url();
            for (int i = 0; i < urls.size(); i++) {
                Object entry = urls.get(i);
                if (entry == null) {
                    continue;
                }
                String url = entry.toString();
                try {
                    // Characters 36..40 of the URL are parsed as a number and stored
                    // with the listing. NOTE(review): presumably the 5-digit listing id
                    // of the detail-page URL - confirm against the stored URLs.
                    int j = Integer.parseInt(url.substring(36, 41));
                    this.saveSoupu028Info(url, j);
                    System.out.println(url);
                    System.out.println("--------------------------------------------");
                    updateUrlGets(url, "{call update_soupu028Gets(?)}", "soupu028Url");
                } catch (RuntimeException e) {
                    // Short/malformed URL or unexpected page layout: skip this entry only.
                    System.err.println("Skipping " + url + ": " + e);
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads one listing page, extracts its fields and stores them through
     * {@code add_soupu028}.
     *
     * <p>Nothing is stored when the title, the phone number or the details are
     * empty, or when a listing with the same title is already present in
     * {@code soupu028}. A field whose markers are missing is treated as empty.
     *
     * @param url address of the listing page
     * @param j   number derived from the URL by the caller; passed to
     *            {@link #savePumian} as both {@code t4g} and {@code i}
     * @throws StringIndexOutOfBoundsException if the page could not be downloaded
     *         or contains no {@code <TBODY>...</TBODY>} section (the same unchecked
     *         type the original substring calls raised in that situation)
     */
    public void saveSoupu028Info(String url, int j) {
        String html = getHTML(url);
        int sectionStart = html.indexOf("<TBODY>");
        int sectionEnd = html.indexOf("</TBODY>");
        if (sectionStart < 0 || sectionEnd < sectionStart) {
            throw new StringIndexOutOfBoundsException(
                    "No <TBODY>...</TBODY> section in page " + url);
        }
        String opStr = html.substring(sectionStart, sectionEnd);

        String city = "成都";
        String title = textBetween(opStr, 0, VALUE_MARKER, FONT_CLOSE);
        String transWay = valueAfterLabel(opStr, "交易性质");
        int area = leadingDigits(valueAfterLabel(opStr, "建筑面积"));
        String region = valueAfterLabel(opStr, "所属区县");
        String place = valueAfterLabel(opStr, "所在位置");

        // Contact name: the value follows the second contact marker; the search
        // resumes 20 characters after the first one (offset taken from the original).
        int firstContact = opStr.indexOf(CONTACT_MARKER);
        String name = firstContact < 0
                ? ""
                : textBetween(opStr, firstContact + 20, CONTACT_MARKER, FONT_CLOSE);

        // Phone: first and last phone value after the width="181" cell, joined by a blank.
        String number = "";
        int phoneCell = opStr.indexOf("width=\"181\">");
        if (phoneCell >= 0) {
            String firstPhone = textBetween(opStr, phoneCell, PHONE_MARKER, FONT_CLOSE);
            int lastPhoneAt = opStr.lastIndexOf(PHONE_MARKER);
            String lastPhone = lastPhoneAt < phoneCell
                    ? ""
                    : textBetween(opStr, lastPhoneAt, PHONE_MARKER, FONT_CLOSE);
            if (firstPhone.length() + lastPhone.length() > 0) {
                number = firstPhone + " " + lastPhone;
            }
        }

        // E-mail: the search resumes 50 characters after the e-mail label
        // (offset taken from the original).
        int emailLabel = opStr.indexOf("#993300>邮");
        String email = emailLabel < 0
                ? ""
                : textBetween(opStr, emailLabel + 50, CONTACT_MARKER, FONT_CLOSE);

        // Details: the first block ends at the next </font>; the last block runs
        // from the last tail marker to the last </font> of the section.
        String details = "";
        String details2 = "";
        int detailsAt = opStr.indexOf(DETAILS_MARKER);
        if (detailsAt >= 0) {
            details = stripMarkup(textBetween(opStr, detailsAt, DETAILS_MARKER, FONT_CLOSE));
            int tailAt = opStr.lastIndexOf(DETAILS_TAIL_MARKER);
            int tailStart = tailAt + DETAILS_TAIL_MARKER.length();
            int lastClose = opStr.lastIndexOf(FONT_CLOSE);
            if (tailAt >= detailsAt && lastClose >= tailStart) {
                details2 = stripMarkup(opStr.substring(tailStart, lastClose));
            }
        }

        if (title.length() == 0 || number.length() == 0
                || (details.length() + details2.length()) == 0) {
            return;
        }
        Vector nameVec = querySoupu028ShopName();
        if (nameVec.contains(title)) {
            return;
        }

        String sql = "{call add_soupu028(?,?,?,?,?,?,?,?,?,?,?,?)}";
        try {
            System.out.println("-------------------开始---------------------");
            System.out.println("信息主题:"+title.trim());
            System.out.println("--------------------------------------------");
            System.out.println("转让方式:"+transWay.trim());
            System.out.println("--------------------------------------------");
            System.out.println("建筑面积:"+area);
            System.out.println("--------------------------------------------");
            System.out.println("所属区县:"+region.trim());
            System.out.println("--------------------------------------------");
            System.out.println("所在位置:"+place.trim());
            System.out.println("--------------------------------------------");
            System.out.println("联系人:"+name.trim());
            System.out.println("--------------------------------------------");
            System.out.println("电话:"+number.trim());
            System.out.println("--------------------------------------------");
            System.out.println("联系人邮件:"+email.trim());
            System.out.println("--------------------------------------------");
            System.out.println("详细内容:"+details.trim()+details2.trim());
            System.out.println("--------------------结束--------------------");
            // savePumian declares (..., transWay, place, ...); the original call passed
            // the two values in the opposite order, so they were stored swapped.
            // NOTE(review): assumes the savePumian signature mirrors add_soupu028.
            savePumian(city, region, getType(title, title), title, number, email,
                    details + details2, j, area, transWay, place, j, sql);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the text between the first {@code startMarker} found at or after
     * {@code from} and the first {@code endMarker} that follows it, or an empty
     * string when either marker is missing.
     */
    private static String textBetween(String text, int from, String startMarker, String endMarker) {
        int markerAt = text.indexOf(startMarker, from);
        if (markerAt < 0) {
            return "";
        }
        int valueStart = markerAt + startMarker.length();
        int valueEnd = text.indexOf(endMarker, valueStart);
        if (valueEnd < 0) {
            return "";
        }
        return text.substring(valueStart, valueEnd);
    }

    /** Returns the first value that follows {@code label}, or "" when the label is absent. */
    private static String valueAfterLabel(String section, String label) {
        int labelAt = section.indexOf(label);
        if (labelAt < 0) {
            return "";
        }
        return textBetween(section, labelAt, VALUE_MARKER, FONT_CLOSE);
    }

    /**
     * Parses the leading run of ASCII digits of {@code text}; returns 0 when the
     * text does not start with a digit or the number does not fit into an int.
     */
    private static int leadingDigits(String text) {
        int end = 0;
        while (end < text.length() && text.charAt(end) >= '0' && text.charAt(end) <= '9') {
            end++;
        }
        if (end == 0) {
            return 0;
        }
        try {
            return Integer.parseInt(text.substring(0, end));
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /** Removes {@code <br>} tags and {@code &nbsp;} entities from scraped text. */
    private static String stripMarkup(String text) {
        return text.replaceAll("<br>", "").replaceAll("&nbsp;", "");
    }

    /**
     * Classifies a listing by keyword.
     *
     * <p>{@code name} is checked against the rules first; {@code range} is only
     * consulted when {@code name} matches none of them. A {@code null} argument
     * is treated as containing no keyword.
     *
     * @param name  primary text, e.g. the listing title
     * @param range fallback text
     * @return the category of the first matching rule, or {@code "商铺租售"}
     *         when neither text matches
     */
    public static String getType(String name, String range) {
        String type = matchType(name);
        if (type == null) {
            type = matchType(range);
        }
        return type != null ? type : "商铺租售";
    }

    /** Returns the category of the first rule with a keyword in {@code text}, or null. */
    private static String matchType(String text) {
        if (text == null) {
            return null;
        }
        for (int r = 0; r < TYPE_RULES.length; r++) {
            String[] rule = TYPE_RULES[r];
            for (int k = 1; k < rule.length; k++) {
                if (text.indexOf(rule[k]) > -1) {
                    return rule[0];
                }
            }
        }
        return null;
    }

    /**
     * Downloads a page and returns its lines, each trimmed, concatenated without
     * separators.
     *
     * <p>The download is attempted twice. If the second attempt fails as well,
     * its stack trace is printed and an empty string is returned.
     *
     * @param url address of the page
     * @return page content, or "" when both attempts failed
     */
    public static String getHTML(String url) {
        try {
            return downloadPage(url);
        } catch (Exception firstFailure) {
            try {
                return downloadPage(url);
            } catch (Exception ex) {
                ex.printStackTrace();
                return "";
            }
        }
    }

    /**
     * Performs one download attempt. Each attempt starts with a fresh buffer, so
     * a retry never appends to the partial content of a failed attempt.
     */
    private static String downloadPage(String url) throws IOException {
        // Public URL constructor instead of the JDK-internal sun.net.www handler.
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        BufferedReader in = null;
        try {
            // Platform default charset, as before.
            // NOTE(review): the site serves Chinese text - confirm the page encoding
            // and pass it explicitly.
            in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            StringBuffer page = new StringBuffer();
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line.trim());
            }
            return page.toString();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Reads the URLs of all rows of {@code soupu028Url} whose {@code gets} column
     * is not 1, ordered by id.
     *
     * @return the URLs as strings; on a database error the message is printed and
     *         the rows read so far (possibly none) are returned
     */
    public Vector querySoupu028Url() {
        return queryColumn("select id,url from soupu028Url where gets!=1 order by id", 2);
    }

    /**
     * Reads the {@code name} column of every row of {@code soupu028}.
     *
     * @return the names as strings; on a database error the message is printed and
     *         the rows read so far (possibly none) are returned
     */
    public Vector querySoupu028ShopName() {
        // The query selects a single column, so it has to be read at index 1; the
        // original read index 2, which always failed and left the result empty.
        return queryColumn("select name from soupu028", 1);
    }

    /** Runs {@code sql} and collects the given 1-based column of every row as strings. */
    private Vector queryColumn(String sql, int column) {
        Vector values = new Vector();
        Connection conn = null;
        Statement stmt = null;
        ResultSet rows = null;
        try {
            conn = getConnection();
            if (conn == null) {
                throw new SQLException("No database connection available");
            }
            stmt = conn.createStatement();
            rows = stmt.executeQuery(sql);
            while (rows.next()) {
                values.add(rows.getString(column));
            }
        } catch (SQLException e) {
            System.out.println(e.getMessage());
        } finally {
            closeQuietly(rows);
            closeQuietly(stmt);
            closeQuietly(conn);
        }
        return values;
    }

    /**
     * Looks up the id of {@code url} in {@code table} and passes it as the single
     * parameter of the callable statement {@code update}.
     *
     * <p>When no row matches the URL a message is printed and the procedure is
     * not called.
     *
     * @param url    URL to look up
     * @param update JDBC call string with one integer parameter
     * @param table  table holding the {@code id} and {@code url} columns; it is
     *               concatenated into the SQL text and must come from trusted code
     * @throws SQLException if no connection is available or a statement fails
     */
    public void updateUrlGets(String url, String update, String table) throws SQLException {
        Connection conn = getConnection();
        if (conn == null) {
            throw new SQLException("No database connection available");
        }
        try {
            int id;
            PreparedStatement lookup = null;
            ResultSet ids = null;
            try {
                // The URL is bound as a parameter instead of being pasted into the SQL.
                lookup = conn.prepareStatement("select id from " + table + " where url=?");
                lookup.setString(1, url);
                ids = lookup.executeQuery();
                if (!ids.next()) {
                    System.err.println("No row in " + table + " for url " + url);
                    return;
                }
                id = ids.getInt(1);
            } finally {
                closeQuietly(ids);
                closeQuietly(lookup);
            }
            System.out.println("ID======"+id);

            CallableStatement call = null;
            try {
                call = conn.prepareCall(update);
                call.setInt(1, id);
                call.executeUpdate();
            } finally {
                closeQuietly(call);
            }
        } finally {
            closeQuietly(conn);
        }
    }

    /**
     * Stores one listing by executing the callable statement {@code sql} with the
     * twelve parameters bound in the order given below.
     *
     * @param city     parameter 1
     * @param region   parameter 2
     * @param type     parameter 3, category as returned by {@link #getType}
     * @param name     parameter 4
     * @param number   parameter 5, phone number(s)
     * @param email    parameter 6
     * @param details  parameter 7
     * @param t4g      parameter 8; NOTE(review): the original inline comment called
     *                 this the scanned flag of the URL table (0 = not scanned,
     *                 1 = scanned), but the visible caller passes the number parsed
     *                 from the URL - confirm
     * @param area     parameter 9
     * @param transWay parameter 10
     * @param place    parameter 11
     * @param i        parameter 12
     * @param sql      JDBC call string with twelve parameters
     * @throws SQLException if no connection is available or the call fails
     */
    public void savePumian(String city, String region, String type, String name, String number, String email,
                           String details, int t4g, int area, String transWay, String place, int i, String sql) throws SQLException {
        Connection conn = getConnection();
        if (conn == null) {
            throw new SQLException("No database connection available");
        }
        CallableStatement call = null;
        try {
            call = conn.prepareCall(sql);
            call.setString(1, city);
            call.setString(2, region);
            call.setString(3, type);
            call.setString(4, name);
            call.setString(5, number);
            call.setString(6, email);
            call.setString(7, details);
            call.setInt(8, t4g);
            call.setInt(9, area);
            call.setString(10, transWay);
            call.setString(11, place);
            call.setInt(12, i);
            call.executeUpdate();
        } finally {
            closeQuietly(call);
            closeQuietly(conn);
        }
    }

    /**
     * Opens a new database connection and also stores it in the legacy
     * {@code con} field.
     *
     * <p>NOTE(review): host, database and credentials are hard-coded here; they
     * should be moved to external configuration.
     *
     * @return the new connection, or {@code null} when the driver cannot be loaded
     *         or the connection cannot be opened (an error line is printed)
     */
    public Connection getConnection() {
        String sDBDriver = "com.microsoft.jdbc.sqlserver.SQLServerDriver";
        String DBCon = "jdbc:microsoft:sqlserver://192.168.1.18:1433;DatabaseName=balan";
        Connection opened = null;
        try {
            Class.forName(sDBDriver);
            opened = DriverManager.getConnection(DBCon, "sa", "jpw2002");
        } catch (ClassNotFoundException e) {
            System.err.println("Unable to load driver:" + e.getMessage());
        } catch (SQLException e) {
            System.err.println("Unable to open connection:" + e.getMessage());
        }
        // Never hand back a handle from an earlier call: callers close their
        // connection when they are done, so an old one would already be closed.
        con = opened;
        return opened;
    }

    /** Closes a result set, ignoring {@code null} and close failures. */
    private static void closeQuietly(ResultSet resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (SQLException ignored) {
                // Best effort: the work using the resource has already finished.
            }
        }
    }

    /** Closes a statement, ignoring {@code null} and close failures. */
    private static void closeQuietly(Statement resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (SQLException ignored) {
                // Best effort: the work using the resource has already finished.
            }
        }
    }

    /** Closes a connection, ignoring {@code null} and close failures. */
    private static void closeQuietly(Connection resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (SQLException ignored) {
                // Best effort: the work using the resource has already finished.
            }
        }
    }
}

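/*
 * Listing-page footer captured together with the source (not part of the program):
 *   Keyboard shortcuts
 *
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Switch theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */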