📄 pageoperate.java

📁 本程序可从网上利用百度搜索引擎下载和输入关键词有关的网页
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
			for (int shuh = 1; shuh < splitHtmlUsehref.length; shuh++)
			{				
				String oneHttp=splitHtmlUsehref[shuh];
	        	int beginIndex=0;
	        	int endIndex=0;
	        	int max=200;
	        	
	        	//去掉第一个不是正常字母的字符
	        	while(oneHttp.startsWith("\"", 0) || oneHttp.startsWith("'", 0)|| oneHttp.startsWith("/", 0) )//第一个字符是双引号开始 或着单引号
	        	{
	        		//System.out.println("-------"+splitHtmlUsehref[shuh]);
	        		beginIndex=1;
	        		endIndex=(splitHtmlUsehref[shuh].length()-1);
	        		oneHttp=http(beginIndex, endIndex, oneHttp);//先从空格处断开
	        		//System.out.println("-------oneHttp:"+oneHttp);
	        	}
	       	        	
	        	if(oneHttp.startsWith("http://", 0))
	        			{
	        		    String oldHtml=oneHttp;
	        		    oneHttp=null;
			            beginIndex=0;
			        	endIndex=oldHtml.indexOf(' ');			        	
			        	if(endIndex!=-1 && endIndex<max)//有空格
			        	{         		
			        		oneHttp=http(beginIndex, endIndex, oldHtml);//从空格处断开       					        		
			        		endIndex=oneHttp.indexOf('"');                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  		endIndex=oneHttp.indexOf('"');
			            	if(endIndex!=-1 && endIndex<max)//有双引号
			            	{
			        		    oneHttp=http(beginIndex, endIndex, oneHttp);//从双引号处断开
			            	}
			        		
			            	endIndex=oneHttp.indexOf(')');
			            	if(endIndex!=-1 && endIndex<max)//有)号
			            	{
			        		    oneHttp=http(beginIndex, endIndex, oneHttp);//从）号处断开
			            	}
			            	
			            	endIndex=oneHttp.indexOf('>');
			            	if(endIndex!=-1 && endIndex<max)//有)号
			            	{
			        		    oneHttp=http(beginIndex, endIndex, oneHttp);//从）号处断开
			            	}
			            	
			            	endIndex=oneHttp.indexOf("'");
			            	if(endIndex!=-1 && endIndex<max)//有'号
			            	{
			        		    oneHttp=http(beginIndex, endIndex, oneHttp);//从）号处断开
			            	}			   			            	
			        	}
			        	else //没有空格
			        	{			
			        		//System.out.println("没有空格或字符串>200");
			        	}
			        	if(oneHttp!=null)
			        	{
			        		myHttpList.add(oneHttp);
			        		//System.out.println("http://"+oneHttp);
			        	}
	        			}        					        		        		        		        				
			}//end for				
		}//end if
	}//end  public 

	/**
	 * Returns the list in which the split-out link fragments are stored.
	 *
	 * @return the {@code myHttpList} field itself (no copy is made)
	 */
	public List getMyListHttp()
	{
		return this.myHttpList;
	}

	
	/**
	 * Reports whether the current page text contains the substring {@code "<form"},
	 * which this class treats as the sign that the page offers a query interface.
	 *
	 * <p>The search is a plain, case-sensitive substring match on {@code myHtml}.
	 *
	 * @return {@code true} if {@code myHtml} is non-null and contains {@code "<form"};
	 *         {@code false} otherwise. When {@code myHtml} is null a notice is also
	 *         printed to standard output.
	 */
	public boolean hasForm()
	{
		if (myHtml == null)
		{
			System.out.println("myHtml是空的");
			return false;
		}

		// A direct substring search is used instead of split("<form") + piece count:
		// split discards trailing empty strings, so a marker at the very end of the
		// text (or a text made only of markers) would be counted as "no form".
		return myHtml.indexOf("<form") != -1;
	}
	/**
	 * Analyses a Baidu result page held in {@code myHtml} and extracts the URL of
	 * the "next page" link.
	 *
	 * <p>{@code myHtml} is split with {@code splitWord} (interpreted as a regular
	 * expression by {@link String#split(String)}). Every piece after the first that
	 * contains the next-page marker is examined: leading {@code "}, {@code '} and
	 * {@code /} characters are stripped, and, if the remainder does not start with
	 * {@code http://}, it is cut at the first space and then at the first
	 * {@code "}, {@code )}, {@code >} and {@code '} that occur within the first
	 * 200 characters. The result is appended to {@code httpPrefix}. If several
	 * pieces qualify, the last one determines the returned value.
	 *
	 * @param httpPrefix   text prepended to the extracted relative link (the original
	 *                     comment names {@code http://www.baidu.com/} as the prefix)
	 * @param splitWord    regular expression used to split the page, e.g. {@code href=}
	 * @param nextPageWord text that marks the next-page link; when null or empty the
	 *                     historical default {@code "下一页"} is used
	 * @return the assembled URL, or an empty string if {@code myHtml} is null or no
	 *         qualifying link was found
	 */
	public String nextPageUrl(String httpPrefix,String splitWord,String nextPageWord)
	{
		String url="";
		if (myHtml == null)
		{
			return url;
		}

		// Previously the marker was hard-coded and nextPageWord was ignored; the
		// parameter is now honoured, with the old literal kept as the fallback.
		String marker = (nextPageWord == null || nextPageWord.length() == 0) ? "下一页" : nextPageWord;
		int max = 200; // delimiters at or beyond this offset are ignored

		String[] segments = myHtml.split(splitWord);
		for (int i = 1; i < segments.length; i++)
		{
			String segment = segments[i];
			if (segment.indexOf(marker) == -1)
			{
				continue; // this piece does not mention the next-page marker
			}

			String oneHttp = segment;

			// Strip leading quote / slash characters one at a time.
			// NOTE(review): the end index is taken from the untrimmed segment, not from
			// the current oneHttp; whether that is safe after the first pass depends on
			// how http() treats its index arguments - confirm against its definition.
			while (oneHttp.startsWith("\"") || oneHttp.startsWith("'") || oneHttp.startsWith("/"))
			{
				oneHttp = http(1, segment.length() - 1, oneHttp);
			}

			// Only links without an http:// scheme are handled here.
			if (oneHttp.startsWith("http://"))
			{
				continue;
			}

			// NOTE(review): the space is located in the untrimmed segment while the cut
			// is applied to the trimmed oneHttp, so the offset may be shifted by the
			// number of stripped characters - kept as in the original; confirm intent.
			int spaceIndex = segment.indexOf(' ');
			if (spaceIndex != -1 && spaceIndex < max)
			{
				oneHttp = http(0, spaceIndex, oneHttp);
				oneHttp = truncateAtDelimiter(oneHttp, oneHttp.indexOf('"'), max);
				oneHttp = truncateAtDelimiter(oneHttp, oneHttp.indexOf(')'), max);
				oneHttp = truncateAtDelimiter(oneHttp, oneHttp.indexOf('>'), max);
				// The single quote is searched from offset 1, as in the original.
				oneHttp = truncateAtDelimiter(oneHttp, oneHttp.indexOf("'", 1), max);
			}
			else
			{
				System.out.println("没有空格或字符串>200");
			}

			if (oneHttp != null)
			{
				url = httpPrefix + oneHttp;
				System.out.println(url);
			}
		}
		return url;
	}

	/**
	 * Applies {@code http(0, index, text)} when {@code index} is a hit below
	 * {@code max}; otherwise returns {@code text} unchanged.
	 */
	private String truncateAtDelimiter(String text, int index, int max)
	{
		if (index != -1 && index < max)
		{
			return http(0, index, text);
		}
		return text;
	}
	
	
	/**
	 * Reports whether the current page text contains both a text-box marker and
	 * the word {@code search}.
	 *
	 * <p>A text-box marker is any of {@code type=text}, {@code type="text"},
	 * {@code type='text'}, {@code class="textbox"}, {@code class=textbox} or
	 * {@code class='textbox'}. All matching is case-sensitive. The outcome is also
	 * printed to standard output whenever {@code myHtml} is non-null.
	 *
	 * @return {@code true} if {@code myHtml} is non-null, contains at least one
	 *         text-box marker and contains {@code "search"}; {@code false} otherwise
	 */
	public boolean hasTextAndSearch()
	{
		if (myHtml == null)
		{
			return false;
		}

		String[] textBoxMarkers = {
			"type=text",
			"type=\"text\"",
			"type='text'",
			"class=\"textbox\"",
			"class=textbox",
			"class='textbox'"
		};

		// Substring search replaces the former six split() calls whose piece counts
		// were summed: split drops trailing empty strings, so a marker sitting at the
		// very end of the text was not counted.
		boolean hasTextBox = false;
		for (int i = 0; i < textBoxMarkers.length && !hasTextBox; i++)
		{
			hasTextBox = myHtml.indexOf(textBoxMarkers[i]) != -1;
		}

		if (hasTextBox && myHtml.indexOf("search") != -1)
		{
			System.out.println("网页中有文本框和搜索字样");
			return true;
		}

		System.out.println("网页中没有文本框和搜索字样");
		return false;
	}
	
	/**
	 * Re-decodes {@code myHtml} according to the charset declaration found in it.
	 *
	 * <p>If the text contains {@code charset=utf-8} (bare, single-quoted or
	 * double-quoted), it is encoded to GBK bytes and those bytes are decoded as
	 * utf-8. Otherwise, if it contains {@code charset=iso8859_1} in one of the
	 * same three forms, the GBK bytes are decoded as iso8859_1. If neither
	 * declaration is present a notice is printed and the text is left untouched.
	 * Does nothing when {@code myHtml} is null. Any exception raised during the
	 * check or conversion is reported on standard output / standard error and
	 * not rethrown.
	 */
	public void charSet()
	{
		if (myHtml == null)
		{
			return;
		}

		try
		{
			boolean declaresUtf8 = myHtml.indexOf("charset=utf-8") != -1
					|| myHtml.indexOf("charset='utf-8'") != -1
					|| myHtml.indexOf("charset=\"utf-8\"") != -1;
			if (declaresUtf8)
			{
				byte[] gbkBytes = myHtml.getBytes("GBK");
				myHtml = new String(gbkBytes, "utf-8");
				return;
			}

			// Checked only after the utf-8 test has failed, as before.
			boolean declaresLatin1 = myHtml.indexOf("charset=iso8859_1") != -1
					|| myHtml.indexOf("charset='iso8859_1'") != -1
					|| myHtml.indexOf("charset=\"iso8859_1\"") != -1;
			if (declaresLatin1)
			{
				byte[] gbkBytes = myHtml.getBytes("GBK");
				myHtml = new String(gbkBytes, "iso8859_1");
				return;
			}

			System.out.println("字符不需要转换");
		}
		catch (Exception e)
		{
			System.out.println("字符转换出现异常");
			e.printStackTrace();
		}
	}
	
					   
	 
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -