📄 parseprofile.java

📁 利用多线程从搜索引擎下载网页并提取数据到数据库。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Stack;

import javax.swing.text.DateFormatter;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Cursor;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.util.ParserException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.parserapplications.filterbuilder.*;
import org.htmlparser.filters.TagNameFilter;

public class ParseProfile {

	// True when saveToDB() should insert the profile rather than update it;
	// the constructor sets it to true.
	private boolean newPerson;

	// Ordered list of section parsers built by init(); parse() keeps invoking
	// the parser at the head of this list.
	LinkedList<Section> sequence;

	// Running count of lexer nodes consumed so far; compared against
	// maxNodeCount by the scanning loops.
	int nodeCount = 0;

	// Upper bound on nodeCount used as a loop guard by the scanning loops.
	final static int maxNodeCount = 5000;

	// Raw HTML of the profile page; taken from
	// profile.person.profile_cached_html in the constructor.
	public String page;

	// Not referenced in the visible part of this file.
	String link;

	// The profile object that parsing populates and that is saved afterwards.
	Profile profile;

	// Jobs collected during parsing; drained into profile.jobs by reversePos().
	Stack<Job> jobs;

	// Schools collected during parsing; drained into profile.schools by
	// reversePos().
	Stack<School> schools;

	// org.htmlparser.nodes.AbstractNode n;
	// Current node, i.e. the node most recently returned by the lexer.
	Node n;

	// HTML parser that is fed the page text in init().
	org.htmlparser.Parser p;

	// Lexer obtained from the parser; the source of all nodes.
	Lexer l;

	// Not referenced in the visible part of this file.
	public boolean silent;

	// Text buffer that saveText() passes to profile.appendTo(...) and then
	// writes out.
	public StringBuilder sb = new StringBuilder();

	/**
	 * Ad-hoc driver: obtains one page through {@code Util.getAPageLocal} from
	 * a hard-coded location, wraps it in a {@code Profile}, parses it, stores
	 * it in the database and finally writes the text dump next to the input.
	 *
	 * NOTE(review): {@code parse()} is invoked a second time after
	 * {@code parseAndSaveToDB()} has already parsed the page; this looks like
	 * a debugging leftover -- confirm before removing.
	 *
	 * @param args
	 *            command-line arguments (not used)
	 * @throws IOException
	 *             propagated from the helper calls made here
	 * @throws SQLException
	 *             propagated from the connection set-up or the second parse
	 */
	public static void main(String[] args) throws IOException, SQLException {
		final String folder = "C:\\temp2\\";
		final String fileName = "149047.htm";

		String html = Util.getAPageLocal(folder + fileName);
		Profile loaded = new Profile(DataAccess.getNewConnection(), 1, html);
		ParseProfile parser = new ParseProfile(loaded);

		// A database failure on the first pass is reported but does not stop
		// the run.
		try {
			parser.parseAndSaveToDB();
		} catch (SQLException dbFailure) {
			dbFailure.printStackTrace();
		}

		parser.parse();
		parser.saveText(folder);
	}

	/**
	 * Creates a parser for the given profile. The page text is taken from
	 * {@code profile2.person.profile_cached_html}, the profile is flagged as
	 * new (so that {@link #saveToDB()} inserts it) and the lexer and section
	 * list are set up through {@code init()}, whose result is not checked.
	 *
	 * @param profile2
	 *            the profile whose cached HTML is to be parsed
	 */
	public ParseProfile(Profile profile2) {
		newPerson = true;
		profile = profile2;
		page = profile2.person.profile_cached_html;
		init();
	}

	/**
	 * Parses the page and, only when {@link #parse()} reports success, stores
	 * the profile through {@link #saveToDB()}.
	 *
	 * @throws SQLException
	 *             propagated from parsing or saving
	 */
	public void parseAndSaveToDB() throws SQLException {
		boolean parsed = parse();
		if (!parsed) {
			return;
		}
		saveToDB();
	}

	/**
	 * Checks whether the page contains the start marker of the person
	 * section, by asking a freshly created {@code PersonSection(false)}.
	 *
	 * @return the result of that section's {@code sectionExists()}
	 */
	private boolean validate() {
		return new PersonSection(false).sectionExists();
	}

	/**
	 * Writes both outputs for this profile into {@code folder}: first the
	 * HTML page, then the text dump.
	 *
	 * @param folder
	 *            path prefix that the file names are appended to
	 */
	public void saveToFile(String folder) {
		this.saveHtml(folder);
		this.saveText(folder);
	}

	/**
	 * Appends the profile to the shared buffer {@code sb} and saves the whole
	 * buffer as {@code <folder><person id>.txt}. The buffer is not cleared, so
	 * every call adds to what earlier calls appended.
	 *
	 * @param folder
	 *            path prefix that the file name is appended to
	 */
	public void saveText(String folder) {
		profile.appendTo(sb);
		String target = folder + profile.person.id + ".txt";
		Util.saveAFile(target, sb.toString());
	}

	/**
	 * Saves the raw page text as {@code <folder><person id>.htm}.
	 *
	 * @param folder
	 *            path prefix that the file name is appended to
	 */
	public void saveHtml(String folder) {
		String target = folder + profile.person.id + ".htm";
		Util.saveAFile(target, page);
	}

	/**
	 * Stores the profile: an insert when it is flagged as new, an update
	 * otherwise.
	 *
	 * @throws SQLException
	 *             propagated from the profile's insert or update
	 */
	public void saveToDB() throws SQLException {
		if (!newPerson) {
			profile.update();
		} else {
			profile.insert();
		}
	}

	/**
	 * Empties both stacks into the profile. Entries are popped, i.e. taken in
	 * reverse order of collection, numbered 1, 2, 3, ... in that order via
	 * their {@code position} field and added to {@code profile.jobs} and
	 * {@code profile.schools} respectively.
	 */
	private void reversePos() {
		int jobRank = 0;
		while (!jobs.isEmpty()) {
			Job current = jobs.pop();
			jobRank++;
			current.position = jobRank;
			profile.jobs.add(current);
		}

		int schoolRank = 0;
		while (!schools.isEmpty()) {
			School current = schools.pop();
			schoolRank++;
			current.position = schoolRank;
			profile.schools.add(current);
		}
	}

	/**
	 * @return the profile this parser works on
	 */
	public Profile getProfile() {
		return this.profile;
	}

	/**
	 * Runs the section parsers over the page.
	 *
	 * The section at the head of {@code sequence} is invoked repeatedly for as
	 * long as there is a head section, a current node, and the node budget
	 * ({@code maxNodeCount}) is not used up. Afterwards the collected jobs and
	 * schools are moved into the profile by {@code reversePos()}.
	 *
	 * @return {@code false} when the person section marker is missing from
	 *         the page or when set-up in {@code init()} did not complete;
	 *         {@code true} otherwise
	 * @throws SQLException
	 *             propagated from the section parsers
	 */
	public boolean parse() throws SQLException {

		if (!(validate()))
			return false;

		// init() returns early on a ParserException and then leaves sequence,
		// jobs and schools unset; report failure instead of dereferencing null.
		if (sequence == null)
			return false;

		while (sequence.peek() != null && (n) != null
				&& nodeCount < maxNodeCount) {
			nodeCount++;
			sequence.peek().parse();
		}
		reversePos();
		return true;
	}

	/**
	 * Prepares parsing: feeds the page to a new parser, fetches the first node
	 * from its lexer, builds the ordered list of section parsers and creates
	 * the empty job and school stacks.
	 *
	 * The person section is always first; the summary, job, school and
	 * additional sections follow, each only when its marker exists in the
	 * page.
	 *
	 * @return {@code false} when the parser or lexer raised a
	 *         {@code ParserException} (the section list and stacks are then
	 *         left unset); {@code true} otherwise
	 */
	private boolean init() {
		p = new Parser();

		try {
			p.setInputHTML(page);
			l = p.getLexer();
			n = (AbstractNode) l.nextNode();
		} catch (ParserException parseFailure) {
			parseFailure.printStackTrace();
			return false;
		}

		// Probe instances, used only to find out which sections are present.
		JobSection jobProbe = new JobSection();
		SchoolSection schoolProbe = new SchoolSection();
		boolean hasJobs = jobProbe.sectionExists();
		boolean hasSchools = schoolProbe.sectionExists();

		sequence = new LinkedList<Section>();
		sequence.addFirst(new PersonSection(hasJobs || hasSchools));

		SummarySection summary = new SummarySection();
		if (summary.sectionExists()) {
			sequence.add(summary);
		}

		if (hasJobs) {
			sequence.add(new JobSection());
		}

		if (hasSchools) {
			sequence.add(new SchoolSection());
		}

		AdditionalSection additional = new AdditionalSection();
		if (additional.sectionExists()) {
			sequence.add(additional);
		}

		jobs = new Stack<Job>();
		schools = new Stack<School>();
		return true;
	}

	class Section {
		// Tag texts that count as separators for this section; matched
		// case-insensitively by isASeperator(...).
		public ArrayList<String> separators = new ArrayList<String>(2);

		// Not referenced in the visible part of this file.
		public ArrayList<String> firstFieldKeys = new ArrayList<String>(2);

		// public ArrayList<String> sectionStartKeys = new ArrayList<String>(2);
		// Value of the "id" attribute that marks where this section starts;
		// used by navigateToSection() and sectionExists().
		public String sectionStartKey;

		// Not referenced in the visible part of this file.
		TagNameFilter tnf = new TagNameFilter();

		/**
		 * Parses this section. The base implementation does nothing.
		 *
		 * @throws SQLException
		 *             declared so that implementations may propagate it
		 */
		public void parse() throws SQLException {
			// intentionally empty
		}

		/**
		 * Advances the shared lexer until a tag whose {@code id} attribute
		 * equals {@code sectionStartKey} becomes the current node.
		 *
		 * Scanning stops when the lexer runs out of nodes, when the node
		 * budget is used up, or when the lexer raises a
		 * {@code ParserException} (which is printed).
		 *
		 * @return {@code true} when the start tag was found, {@code false}
		 *         otherwise
		 */
		public boolean navigateToSection() {
			try {
				for (n = l.nextNode(); n != null && nodeCount < maxNodeCount; n = l
						.nextNode()) {
					nodeCount++;
					if (!(n instanceof Tag)) {
						continue;
					}
					String idValue = ((Tag) n).getAttribute("id");
					if (idValue != null && idValue.equals(sectionStartKey)) {
						return true;
					}
				}
			} catch (ParserException lexFailure) {
				lexFailure.printStackTrace();
			}
			return false;
		}

		/**
		 * Tells whether the raw page text contains the opening {@code div} of
		 * this section, with the id written either unquoted or in double
		 * quotes.
		 *
		 * @return {@code true} when either form occurs in the page
		 */
		public boolean sectionExists() {
			String unquoted = "<div id=" + sectionStartKey + ">";
			if (page.contains(unquoted)) {
				return true;
			}
			String quoted = "<div id=\"" + sectionStartKey + "\">";
			return page.contains(quoted);
		}

		/**
		 * Tells whether {@code nodeText} is one of the separators of the
		 * section currently at the head of {@code sequence}.
		 *
		 * @param nodeText
		 *            text of the node being examined
		 * @return {@code true} when there is no head section, or when the text
		 *         matches one of its separators ignoring case
		 */
		protected boolean isASeperator(String nodeText) {
			Section head = sequence.peek();
			return head == null || equalKeyword(nodeText, head.separators);
		}

		/**
		 * Case-insensitive membership test.
		 *
		 * @param nodeText
		 *            text to look for
		 * @param keywords
		 *            candidates to compare against
		 * @return {@code true} when {@code nodeText} equals at least one
		 *         keyword, ignoring case
		 */
		private boolean equalKeyword(String nodeText, ArrayList<String> keywords) {
			for (int i = 0; i < keywords.size(); i++) {
				if (nodeText.equalsIgnoreCase(keywords.get(i))) {
					return true;
				}
			}
			return false;
		}

		/**
		 * Fills {@code ht} with the text that follows tags whose
		 * {@code class} attribute is one of the table's keys.
		 *
		 * Scanning starts at the current node and continues until a tag whose
		 * text equals {@code stopTag} is reached, the lexer runs out of nodes,
		 * or the node budget is used up. For every matching tag the lexer is
		 * advanced past any directly following tags, and the text of the first
		 * non-tag node (cleaned with {@code Util.cleanString}) replaces the
		 * value stored under that key; an empty string is stored when the
		 * input ends first.
		 *
		 * @param ht
		 *            table whose keys are the class values to look for; its
		 *            values are overwritten with the text found
		 * @param stopTag
		 *            tag text at which scanning ends
		 */
		protected void getTextsByTagAtts(Hashtable<String, String> ht,
				String stopTag) {
			String tagAttValue = "";
			String textValue = "";
			while (nodeCount < maxNodeCount && n != null) {

				if (n instanceof Tag) {
					if (n.getText().equals(stopTag))
						break;
					tagAttValue = ((Tag) n).getAttribute("class");
					if (tagAttValue != null) {
						// Collapse compound class values to the two keys that
						// are matched by substring.
						if (tagAttValue.contains("headline title"))
							tagAttValue = "headline title";
						if (tagAttValue.contains("recommendation-count"))
							tagAttValue = "recommendation-count";
						if (ht.containsKey(tagAttValue)) {
							// Skip nested or adjacent tags until the first
							// non-tag node (or the end of input) is current.
							while (n instanceof Tag) {
								try {
									n = l.nextNode();
								} catch (ParserException e) {
									// Reported only; n is left unchanged, so
									// the loop tries the lexer again.
									e.printStackTrace();
								}
								nodeCount++;
							}
							textValue = n == null ? "" : Util.cleanString(n
									.getText());
							ht.put(tagAttValue, textValue);
						}
					}
				}
				// Move on to the next node for the following iteration.
				try {
					n = l.nextNode();
				} catch (ParserException e) {
					// Reported only; n is left unchanged and the node count
					// below still advances.
					e.printStackTrace();
				}
				nodeCount++;
			}
		}

		/**
		 * Looks ahead for a tag whose {@code class} attribute equals
		 * {@code pattern} (ignoring case) and returns its {@code title}
		 * attribute.
		 *
		 * At least one node and at most {@code next + 1} nodes are fetched.
		 * The search also ends early at a tag whose class is exactly
		 * {@code "dtstamp"}. In every case one further node is fetched before
		 * returning, so the current node ends up just past the point where the
		 * search stopped.
		 *
		 * @param pattern
		 *            class value to look for
		 * @param next
		 *            number of additional nodes to examine after the first
		 * @return the {@code title} attribute of the matching tag (whatever
		 *         {@code getAttribute} yields for it), or an empty string when
		 *         no tag matched
		 */
		protected String getDateTag(String pattern, int next) {
			String title = "";
			int examined = 0;
			while (true) {
				try {
					n = l.nextNode();
					nodeCount++;
				} catch (ParserException lexFailure) {
					lexFailure.printStackTrace();
				}

				String cssClass = null;
				if (n instanceof Tag) {
					cssClass = ((Tag) n).getAttribute("class");
				}
				if (cssClass != null) {
					if (cssClass.equalsIgnoreCase(pattern)) {
						title = ((Tag) n).getAttribute("title");
						break;
					}
					if (cssClass.equals("dtstamp")) {
						break;
					}
				}

				if (examined >= next) {
					break;
				}
				examined++;
			}

			// Step past the node the search stopped on.
			try {
				n = l.nextNode();
			} catch (ParserException lexFailure) {
				lexFailure.printStackTrace();
			}
			return title;
		}

		/**
		 * Searches forward, starting with the node after the current one, for
		 * a tag whose text equals {@code tag} (ignoring case) and returns the
		 * text of the first non-tag node that follows it.
		 *
		 * The search gives up at a tag whose text equals {@code stopTag}, at a
		 * separator of the current head section, at the end of input, when the
		 * node budget is used up, or when the lexer raises a
		 * {@code ParserException} (which is printed).
		 *
		 * @param tag
		 *            tag text to look for
		 * @param stopTag
		 *            tag text at which the search is abandoned
		 * @return the text found, or an empty string when nothing was found
		 */
		protected String getTextByTag(String tag, String stopTag) {
			String result = "";

			try {
				n = l.nextNode();
				while (true) {
					if (n instanceof Tag) {
						if (n.getText().equals(stopTag)
								|| isASeperator(n.getText())) {
							break;
						}
						if (n.getText().equalsIgnoreCase(tag)) {
							// n is a tag here, so advance at least once and
							// keep going while tags follow.
							do {
								n = l.nextNode();
								nodeCount++;
							} while (n instanceof Tag);
							result = (n != null) ? n.getText() : "";
							break;
						}
					}
					nodeCount++;
					n = l.nextNode();
					if (n == null || nodeCount >= maxNodeCount) {
						break;
					}
				}
			} catch (ParserException lexFailure) {
				lexFailure.printStackTrace();
			}
			return result;

		}

		// This method compares the tag's class attribute in a less strict
		// way, using String's contains method.
		// It also checks the stop signal; for non-critical fields this
		// method is more flexible.
		protected String getTextByTagAtt2(String attValue, String stopTag) {
			String attValueFound = "";
			String textFound = "";
			try {
				n = l.nextNode();
				do {
					nodeCount++;
					if (n instanceof Tag) {
						if (n.getText().equalsIgnoreCase(stopTag)
								|| isASeperator(n.getText()))
							break;
						if ((attValueFound = ((Tag) n).getAttribute("class")) != null)
							if (attValueFound.contains(attValue)) {
								while (n instanceof Tag) {
									n = l.nextNode();
									nodeCount++;
								}
								textFound = n == null ? "" : n.getText();
								break;
							}
					}

				} while ((n = l.nextNode()) != null && nodeCount < maxNodeCount);
			} catch (ParserException e) {
				e.printStackTrace();
			}
12 下一页
💿 文件大小 43 K
👤 上传用户 zl357159
📂 所属分类 Java编程
🏷️ 相关标签

#多线程 #搜索引擎 #页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -