⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 writer.java

📁 使用Eclipse编写的java的网络图片爬虫
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package gather;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Locale;

public class Writer
{
	// Size thresholds in bytes. writeData compares the downloaded length against
	// FL_100K, FL_500K and FL_1M to pick the output sub-folder; FL_50K and FL_5M
	// are not referenced in the visible part of this file.
	public final int	FL_50K			= 50 * 1024;
	public final int	FL_100K			= 100 * 1024;
	public final int	FL_500K			= 500 * 1024;
	public final int	FL_1M			= 1024 * 1024;
	public final int	FL_5M			= 5 * 1024 * 1024;

	// rlength is not used in the visible part of this file.
	int					rlength			= 0;
	// Number of bytes downloaded by the current/last writeData call.
	int					nlength			= 0;
	// Number of valid entries in urlPath, as returned by parseURLPath.
	private int			nUrlPath		= 0;
	// Pieces of the page URL filled in by parseURLPath: the "scheme://host/" root
	// first, then one entry per directory of the path.
	private String[]	urlPath			= new String[100];

	// Running counter used by savetofile to build file names; static, so it is
	// shared by every Writer instance.
	static long			nSaveFileCount	= 0;

	// Not used in the visible part of this file.
	// NOTE(review): presumably a sleep interval in milliseconds (one minute) - confirm.
	private final int	nSLEEPTIME		= 1 * 60000;
	// Downloaded data; writeData replaces this array as data arrives.
	byte[]				buf				= new byte[10240];
	// Chunk buffer that writeData reads the input stream into.
	byte[]				inbuf			= new byte[10240];
	// Scratch array writeData uses while enlarging buf.
	byte[]				tembuf			= new byte[10240];

	/**
	 * Creates a writer. The constructor body is empty; all state comes from the
	 * field initialisers.
	 */
	public Writer()
	{

	}

	/**
	 * Copies {@code length} bytes from {@code src} (starting at {@code srcpos})
	 * into {@code dest} (starting at {@code destpos}).
	 * <p>
	 * Delegates to {@link System#arraycopy}. When a range is out of bounds nothing
	 * is copied and an {@link IndexOutOfBoundsException} is thrown. When
	 * {@code src} and {@code dest} are the same array, overlapping ranges are
	 * copied as if through a temporary buffer.
	 *
	 * @param src     source array
	 * @param srcpos  index of the first byte to read from {@code src}
	 * @param dest    destination array
	 * @param destpos index of the first byte to write in {@code dest}
	 * @param length  number of bytes to copy; zero or negative copies nothing
	 */
	public void memcpy(byte[] src, int srcpos, byte[] dest, int destpos, int length)
	{
		// Zero or negative length is a no-op and the arrays are not inspected,
		// which callers of the former byte-by-byte loop may rely on.
		if (length <= 0)
			return;
		System.arraycopy(src, srcpos, dest, destpos, length);
	}

	/**
	 * Extracts the charset name announced by a {@code charset=} marker in raw
	 * bytes, for example an HTTP header value or the start of an HTML page.
	 * <p>
	 * The bytes are decoded as ISO-8859-1, which maps every byte to exactly one
	 * char. String indices therefore line up with byte offsets whatever the real
	 * encoding of the data is, so multi-byte content before the marker cannot push
	 * the scan past the end of the string.
	 * <p>
	 * The name ends at the first quote, {@code /}, {@code >}, space, CR, LF,
	 * {@code ,} or {@code ;}, or after 30 characters. One opening quote directly
	 * after the marker is skipped so that {@code <meta charset="utf-8">} is
	 * recognised.
	 *
	 * @param buf raw bytes to scan; may be {@code null} or empty
	 * @return the lower-cased charset name when one is declared and supported by
	 *         this JVM, otherwise the empty string
	 */
	public String findCharset(byte[] buf)
	{
		if (buf == null || buf.length <= 0)
			return "";
		final String strCHARSETMARK = "charset=";
		final String strTERMINATORS = "\"'/> \r\n,;";
		final int nMAXNAMELENGTH = 30;

		// Locale.ROOT keeps the lower-casing independent of the default locale
		// (a Turkish locale would turn 'I' into a dotless i).
		String strtem = new String(buf, Charset.forName("ISO-8859-1")).toLowerCase(Locale.ROOT);
		int n = strtem.indexOf(strCHARSETMARK);
		if (n < 0)
			return "";// no declaration at all

		int pos = n + strCHARSETMARK.length();
		if (pos < strtem.length() && (strtem.charAt(pos) == '"' || strtem.charAt(pos) == '\''))
			pos++;// step over the opening quote of a quoted value

		int limit = Math.min(strtem.length(), pos + nMAXNAMELENGTH);
		int i = pos;
		while (i < limit && strTERMINATORS.indexOf(strtem.charAt(i)) < 0)
			i++;

		String strcharset = strtem.substring(pos, i);
		if (strcharset.isEmpty())
			return "";
		try
		{
			return Charset.isSupported(strcharset) ? strcharset : "";
		}
		catch (IllegalArgumentException e)
		{
			// Syntactically illegal charset name (IllegalCharsetNameException).
			return "";
		}
	}

	/**
	 * Writes {@code buf} to a new file named
	 * {@code strPath + <counter as 8 upper-case hex digits> + "." + strFileExt},
	 * creating the directory {@code strPath} first if necessary. The counter is the
	 * static {@code nSaveFileCount}, which is incremented on every call.
	 * <p>
	 * Failures are not propagated: the stack trace is printed and the method
	 * returns. The output stream is closed on every path.
	 *
	 * @param strPath    target directory, including the trailing separator
	 * @param strFileExt file name suffix without the leading "." (the original
	 *                   note says it is read from the end of the link)
	 * @param buf        bytes to write; the whole array is written
	 */
	public void savetofile(String strPath, String strFileExt, byte[] buf)
	{
		FileOutputStream fos = null;
		try
		{
			new File(strPath).mkdirs();
			// Only the counter goes through the formatter, so a '%' in the path
			// or the extension is taken literally instead of as a format specifier.
			String strfile = strPath + String.format("%08X.", nSaveFileCount++) + strFileExt;
			fos = new FileOutputStream(strfile);
			fos.write(buf);
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
		finally
		{
			if (fos != null)
			{
				try
				{
					fos.close();
				}
				catch (Exception e)
				{
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Downloads the whole body of {@code con} into {@code buf} / {@code nlength}
	 * and then either stores it as a file or scans it for further links.
	 * <ul>
	 * <li>{@code bSaveToFile == true}: the data is written below
	 * {@code D:\temp\<host>\} in a sub-folder chosen by size ({@code FL_500K},
	 * {@code FL_1M}, {@code FL_Over1M}). Downloads shorter than 5120 bytes are
	 * reported and dropped. Downloads from 5120 bytes up to {@code FL_100K} are
	 * dropped silently, because their sub-folder is disabled.</li>
	 * <li>{@code bSaveToFile == false}: the charset is taken from the response
	 * headers, else from the page bytes, else defaults to {@code gbk}. The decoded
	 * page is passed to {@code getIMGLinkText}, {@code getBackgroundLinkText} and
	 * {@code getOtherUrlLink}.</li>
	 * </ul>
	 * Every exception is caught and printed, so none reaches the caller despite
	 * the {@code throws} clause.
	 *
	 * @param con         open connection to read from
	 * @param strurl      URL the connection belongs to; used for the host folder,
	 *                    the base-path pieces and the file extension
	 * @param bSaveToFile {@code true} to store the data, {@code false} to analyse it
	 * @return always 0
	 * @throws Exception declared for compatibility with existing callers
	 */
	public int writeData(URLConnection con, String strurl, boolean bSaveToFile) throws Exception
	{
		int n = 0;// never modified: the method always returns 0
		String strcharset = "";
		try
		{
			nlength = 0;

			nUrlPath = parseURLPath(strurl);// split the URL into its base-path pieces
			String strHost = new URL(strurl).getHost();
			if (strHost == null || strHost.length() == 0)
				strHost = "noname";
			String strWritePath = "D:\\temp\\" + strHost + "\\";
			// ------------------------------------------------------------
			if (!bSaveToFile)
			{// Page analysis: look for a charset in the response headers first.
				// getHeaderField is declared on URLConnection, so no cast to
				// HttpURLConnection is needed and other connection types work too.
				int nField = 0;
				String strHeaderField;
				while ((strHeaderField = con.getHeaderField(nField++)) != null)
				{
					if (strHeaderField.toLowerCase(Locale.ROOT).contains("charset="))
					{
						strcharset = findCharset(strHeaderField.getBytes());
						if (!strcharset.isEmpty())
							break;
					}
				}
			}
			// ------------------------------------------------------------
			ByteArrayOutputStream collected = new ByteArrayOutputStream();
			InputStream br = con.getInputStream();// obtain the data stream
			try
			{
				int len;
				while ((len = br.read(inbuf)) > 0)
				{
					collected.write(inbuf, 0, len);
				}
			}
			finally
			{
				// Publish whatever arrived, even after a failed read, so buf and
				// nlength always describe this download and never a previous one.
				buf = collected.toByteArray();
				nlength = buf.length;
				// The stream must always be closed, otherwise the connection's
				// port resources are eventually exhausted.
				br.close();
			}

			if (bSaveToFile)
			{// The data is to be written to a file:
				String strSubPath = "";
				if (nlength < 5120)
				{
					System.out.println("------------------------>该数据长度小于指定值,丢弃。" + nlength);
				}
				else if (nlength < FL_100K)
				{
					// The "FL_100K\\" sub-folder is disabled, so downloads in
					// this size range are not saved.
				}
				else if (nlength < FL_500K)
				{
					strSubPath = "FL_500K\\";
				}
				else if (nlength < FL_1M)
				{
					strSubPath = "FL_1M\\";
				}
				else
				{
					strSubPath = "FL_Over1M\\";
				}
				if (!strSubPath.isEmpty())
				{
					// savetofile inserts the '.' itself, so strip the one that
					// comes with a three-letter suffix such as ".jpg"; otherwise
					// the file would be named "NNNNNNNN..jpg".
					String strFileExt = strurl.substring(Math.max(0, strurl.length() - 4));
					if (strFileExt.startsWith("."))
						strFileExt = strFileExt.substring(1);
					savetofile(strWritePath + strSubPath, strFileExt, buf);
				}
			}
			else
			{// The page is to be analysed for links:
				if (strcharset.isEmpty())
				{// nothing in the HTTP headers: look for "charset" inside the page
					strcharset = findCharset(buf);
				}
				if (strcharset.isEmpty())
				{// still nothing: fall back to the default
					strcharset = "gbk";
				}
				String strhtml = new String(buf, 0, nlength, strcharset);
				// ----------------------------------------------
				// Collect the wanted file links:
				getIMGLinkText(strhtml);
				getBackgroundLinkText(strhtml);
				getOtherUrlLink(strhtml, strcharset);
			}
		}
		catch (Exception e)
		{
			System.out.println("Error:_Writer::" + e);
		}
		// ---------------------------------------------------------------------------
		return n;
	}

	/**
	 * Splits a URL into its base-path pieces and stores them in {@code urlPath}.
	 * <p>
	 * Entry 0 is the root {@code scheme://host[:port]/}; each following entry is
	 * one directory of the path including its trailing {@code /}. The file-name
	 * part after the last {@code /} is not stored. For example
	 * {@code http://host/a/b/c.jpg} yields {@code "http://host/"}, {@code "a/"},
	 * {@code "b/"} and a return value of 3. The URL is lower-cased before parsing.
	 * <p>
	 * The root keeps the scheme and any explicit port of the URL instead of always
	 * using {@code http://} on the default port. {@code urlPath} grows when a path
	 * has more directories than it currently holds.
	 *
	 * @param strurl absolute URL to split; may be {@code null} or empty
	 * @return number of entries written to {@code urlPath}, or 0 when the input is
	 *         empty or cannot be parsed (a message is printed in the latter case)
	 */
	public int parseURLPath(String strurl)
	{
		if ((null == strurl) || (strurl.length() == 0))
			return 0;
		// Locale.ROOT: lower-casing must not depend on the machine's default locale.
		strurl = strurl.toLowerCase(Locale.ROOT);
		int nCount = 0;
		try
		{
			URL url = new URL(strurl);
			String strRoot = url.getProtocol() + "://" + url.getHost();
			if (url.getPort() != -1)
				strRoot += ":" + url.getPort();
			urlPath[nCount++] = strRoot + "/";

			String strpath = url.getPath();
			if (strpath.length() > 0)
				strpath = strpath.substring(1);// drop the leading '/'
			while (strpath.contains("/"))
			{
				int pos = strpath.indexOf("/") + 1;
				if (nCount == urlPath.length)
				{// more directories than slots: enlarge instead of failing
					String[] grown = new String[urlPath.length * 2];
					System.arraycopy(urlPath, 0, grown, 0, nCount);
					urlPath = grown;
				}
				urlPath[nCount++] = strpath.substring(0, pos);
				strpath = strpath.substring(pos);
			}
			return nCount;
		}
		catch (Exception e)
		{
			System.out.println("Error:_HTML_parseURLPath:" + strurl + e.getMessage());
			return 0;
		}
	}

	/**
	 * Scans a page for {@code <img src=...>} links and queues the image URLs.
	 * <p>
	 * The page is lower-cased, then every occurrence of the literal
	 * {@code <img src=} is examined. The link text starts after an optional
	 * opening quote and ends at the first quote, {@code >}, space, CR, LF,
	 * {@code ,} or {@code ;}, or at the end of the page. Each link is passed
	 * through {@code completeURL}. Results ending in {@code .jpg}, {@code .bmp} or
	 * {@code .jpeg} that are not yet in {@code controler.IMGList} are added to both
	 * {@code controler.IMGList} and {@code controler.MissionList}.
	 * <p>
	 * Exceptions are caught and their message printed; nothing is thrown.
	 *
	 * @param strHtml decoded page text; {@code null} or empty is ignored
	 */
	public void getIMGLinkText(String strHtml)
	{
		final String strBZ = "<img src=";
		final String strTERMINATORS = "\"'> \r\n,;";

		if (strHtml == null || strHtml.isEmpty())
			return;
		try
		{
			// Locale.ROOT: the tag match must not depend on the default locale.
			strHtml = strHtml.toLowerCase(Locale.ROOT);

			int nstoppos = 0;
			while (true)
			{
				int pos = strHtml.indexOf(strBZ, nstoppos);
				if (-1 == pos)
					break;
				int nstartpos = pos + strBZ.length();
				if (nstartpos >= strHtml.length())
					break;// page ends right after "<img src=": there is no link to read
				char c = strHtml.charAt(nstartpos);
				if ('"' == c || '\'' == c)
					nstartpos++;// skip the opening quote

				int i = nstartpos;
				while (i < strHtml.length() && strTERMINATORS.indexOf(strHtml.charAt(i)) < 0)
					i++;
				nstoppos = i;// the next search resumes after this link

				String strmylinktext = completeURL(strHtml.substring(nstartpos, i));
				//----------------------------------------------
				// NOTE(review): g is never used after this assignment. It is kept
				// because the gather constructor is not visible here and may have
				// side effects - confirm, then remove.
				gather g = new gather();
				g.strurl = strmylinktext;
				if (strmylinktext.endsWith(".jpg") || strmylinktext.endsWith(".bmp") || strmylinktext.endsWith(".jpeg"))
				{
					if (!controler.IMGList.contains(strmylinktext))
					{
						controler.IMGList.add(strmylinktext);
						controler.MissionList.add(strmylinktext);
					}
				}
			}
		}
		catch (Exception e)
		{
			System.out.println("Error:" + e.getMessage());
		}
	}

	public void getBackgroundLinkText(String strHtml)
	{
		String line = "";
		String strBZ = "background=";
		String strmylinktext = "";

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -