⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 parseprofile.java

📁 利用多线程从搜索引擎下载网页并提取数据到数据库。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
			return textFound;

		}

		/**
		 * Scans forward from the current lexer position for a tag whose
		 * attribute {@code tagAttName} matches {@code pattern}, then returns the
		 * text of the first non-tag node that follows the hit.
		 * <p>
		 * The attribute value is compared case-insensitively
		 * ({@code equalsIgnoreCase}). Every node consumed while scanning is
		 * counted in {@code nodeCount}; scanning gives up once
		 * {@code maxNodeCount} is reached.
		 * 
		 * @param tagAttName
		 *            "id" or "class" or anything.
		 * @param pattern
		 *            the attribute value to hit.
		 * @return the text of the first non-tag node after the hit tag. The
		 *         {@code ""} literal is returned when {@code isASeperator}
		 *         accepts a node's text before any hit, when the input or the
		 *         node budget is exhausted without a hit, when no node follows
		 *         the hit, or when the lexer throws.
		 */
		protected String getTextByTagAtt(String tagAttName, String pattern) {
			try {
				n = l.nextNode();
				while (n != null && nodeCount < maxNodeCount) {
					if (isASeperator(n.getText())) {
						// Consume one more node so the next scan starts past
						// the separator.
						n = l.nextNode();
						return "";
					}

					if (n instanceof Tag) {
						String tagAttValue = ((Tag) n).getAttribute(tagAttName);
						if (tagAttValue != null
								&& tagAttValue.equalsIgnoreCase(pattern)) {
							// Skip the hit tag and any tags directly after it;
							// instanceof is false for null, so this also stops
							// at end of input.
							while (n instanceof Tag) {
								n = l.nextNode();
								nodeCount++;
							}
							return n == null ? "" : n.getText();
						}
					}
					n = l.nextNode();
					nodeCount++;
				}
			} catch (ParserException e) {
				e.printStackTrace();
				return "";
			}
			// No tag matched. Previously the text of the last scanned node
			// leaked out here and was mistaken for a hit by callers.
			return "";
		}

		/**
		 * Returns the raw page text that starts at the first occurrence of
		 * {@code startString} at or after the current lexer position and ends
		 * just before the following occurrence of {@code endString}, with
		 * {@code <BR>} / {@code <br>} replaced by newlines.
		 * <p>
		 * The returned text includes {@code startString} itself and excludes
		 * {@code endString}.
		 * 
		 * @param startString
		 *            marker that opens the region.
		 * @param endString
		 *            marker that closes the region; it is searched for from the
		 *            start marker onwards so it can never precede it.
		 * @return the region's text, or {@code ""} when either marker is
		 *         missing.
		 */
		protected String getMultiLineRaw(String startString, String endString) {
			int start = page.indexOf(startString, l.getPosition());
			if (start < 0) {
				return "";
			}
			// Search from the start marker, not from the lexer position: an
			// earlier endString would otherwise give end < start and make
			// substring throw.
			int end = page.indexOf(endString, start);
			if (end < 0) {
				return "";
			}
			String firstField = page.substring(start, end);
			// Literal replacement; no regex is needed for these fixed tokens.
			firstField = firstField.replace("<BR>", "\n");
			return firstField.replace("<br>", "\n");
		}

		/**
		 * Concatenates the text of consecutive {@code Text} nodes read from
		 * the lexer until a tag named "P" (case-insensitive) is reached, the
		 * input ends, the node budget is used up, or the lexer fails.
		 * 
		 * @return the collected text after {@code Util.cleanString}.
		 */
		protected String getMultiLine() {
			StringBuilder sb = new StringBuilder();
			do {
				try {
					n = l.nextNode();
				} catch (ParserException e) {
					e.printStackTrace();
					// n still holds the previous node here; continuing would
					// re-append it and could loop forever.
					break;
				}
				// Count consumed nodes so the maxNodeCount guard below can
				// actually terminate the loop, as in getTextByTagAtt.
				nodeCount++;
				if (n instanceof Text) {
					sb.append(n.getText());
				}
				if (n instanceof Tag
						&& ((Tag) n).getTagName().equalsIgnoreCase("P")) {
					break;
				}
			} while (n != null && nodeCount < maxNodeCount);

			return Util.cleanString(sb.toString());
		}

	}

	/**
	 * Parses the "experience" section: each tag with class "title" starts one
	 * {@code Job}, which is filled from the tags that follow it and pushed
	 * onto {@code jobs}.
	 */
	class JobSection extends Section {
		public JobSection() {
			sectionStartKey = "experience";
			super.separators.add("HR");
			super.separators.add("hr /");
		}

		/**
		 * Reads jobs until {@code getTextByTagAtt} reports no further title
		 * (empty string), then polls {@code sequence}.
		 * 
		 * @throws SQLException
		 *             declared by the signature; no statement visible here
		 *             throws it directly (presumably raised by a helper —
		 *             confirm).
		 */
		public void parse() throws SQLException {
			if (!super.navigateToSection())
				return;

			String firstField;

			// Compare by value: `!= ""` only tested reference identity and so
			// depended on the helper returning the interned literal.
			while (!"".equals(firstField = getTextByTagAtt("class", "title"))) {

				Job job = new Job(profile.person);
				job.role = Util.cleanString(firstField);
				job.organization_name = Util.cleanString(getTextByTagAtt(
						"class", "org summary"));
				String[] orgDetails = Util.cleanString(
						getTextByTagAtt("class", "organization-details"))
						.split(";");

				for (int i = 0; i < orgDetails.length; i++)
					orgDetails[i] = Util.cleanString(orgDetails[i]);

				if (orgDetails.length == 4) {
					// Full record: type; size; ticker; industry, in order.
					job.organization_type = orgDetails[0];
					job.organization_size = orgDetails[1];
					job.organization_stock_ticker = orgDetails[2];
					job.industry = orgDetails[3];
				} else if (orgDetails.length < 4) {
					// Partial record: classify each piece by its content.
					for (int i = 0; i < orgDetails.length; i++) {
						String detail = orgDetails[i];
						if (detail.contains("industry"))
							job.industry = detail;
						else if (detail.contains("employees")
								|| detail.equalsIgnoreCase("Myself Only"))
							job.organization_size = detail;
						else if (detail.toUpperCase().equals(detail))
							// An all-upper-case piece is taken as the ticker.
							job.organization_stock_ticker = detail;
						else
							job.organization_type = detail;
					}
				}
				// More than four pieces: organization details are left unset.

				job.start_date = Util.parseDate(getDateTag("dtstart", 5));
				job.end_date = Util.parseDate(getDateTag("dtend", 5));
				String act = getTextByTagAtt2("description", "/li");
				if (!"".equals(act)) {
					job.description = Util.cleanString(act) + "\n"
							+ getMultiLine();
				}
				// Skip jobs whose cleaned role is empty; the old identity
				// check let them through.
				if (!"".equals(job.role))
					jobs.push(job);
			}
			sequence.poll();
		}

	}

	/**
	 * Parses the "education" section: each tag with class "summary fn org"
	 * starts one {@code School}, which is pushed onto {@code schools}.
	 */
	class SchoolSection extends Section {
		public SchoolSection() {
			super.sectionStartKey = "education";
			super.separators.add("HR");
			super.separators.add("hr /");
		}

		/**
		 * Reads schools until {@code getTextByTagAtt} reports no further
		 * entry (empty string), then polls {@code sequence}.
		 */
		public void parse() {
			if (!super.navigateToSection())
				return;
			String firstField;

			// Compare by value: `!= ""` only tested reference identity.
			while (!"".equals(firstField = getTextByTagAtt("class",
					"summary fn org"))) {
				School school = new School(profile.person);
				school.university = Util.cleanString(firstField);

				// Each optional field is looked up from the same starting
				// position; the lexer is rewound after every lookup.
				int pos = l.getPosition();
				school.degree = Util.cleanString(getTextByTagAtt2("degree",
						"/li"));
				l.setPosition(pos);
				school.major = Util
						.cleanString(getTextByTagAtt2("major", "/li"));
				l.setPosition(pos);
				school.start_date = Util.parseDate(getDateTag("dtstart", 20));
				l.setPosition(pos);
				school.end_date = Util.parseDate(getDateTag("dtend", 40));
				l.setPosition(pos);

				String act = getTextByTagAtt2("activities-societies", "/li");
				if (!"".equals(act)) {
					// Advance six nodes; the text of the sixth one is taken
					// as the activities value.
					for (int i = 0; i < 6; i++) {
						try {
							n = l.nextNode();
						} catch (ParserException e) {
							e.printStackTrace();
						}
					}

					school.activities = n == null ? null : Util.cleanString(n
							.getText());
				}
				// Skip entries whose cleaned name is empty; the old identity
				// check let them through.
				if (!"".equals(school.university))
					schools.push(school);
			}

			sequence.poll();
		}
	}

	/**
	 * Parses the "additional-information" section, collecting groups and
	 * honors into {@code profile.groups} and {@code profile.honors}.
	 */
	class AdditionalSection extends Section {

		public AdditionalSection() {
			super.sectionStartKey = "additional-information";
			super.separators.add("HR");
			super.separators.add("hr /");
		}

		/**
		 * Parses groups and honors when their labels occur at or after the
		 * current lexer position.
		 */
		public void parse() {
			if (!super.navigateToSection())
				return;
			// indexOf returns -1 when absent, so presence is ">= 0".
			if (page.indexOf("Groups:", l.getPosition()) >= 0) {
				parseGroups();
			}

			if (page.indexOf("Honors:", l.getPosition()) >= 0) {
				parseHonors();
			}
			// NOTE(review): unlike the sibling sections, this one never
			// calls sequence.poll() — confirm that is intentional.
		}

		/** Adds one {@code Group} per non-empty "fn org" entry. */
		private void parseGroups() {
			String firstField;
			// Move to the group list; the returned text is not needed.
			getTextByTagAtt("class", "affiliation vcard");
			// Compare by value: `!= ""` only tested reference identity.
			while (!"".equals(firstField = getTextByTagAtt2("fn org", "/ul"))) {
				Group group = new Group(profile.person);
				group.name = Util.cleanString(firstField);
				if (!group.name.equals(""))
					profile.groups.add(group);
			}
		}

		/**
		 * Adds the first honor found after the "honors" tag, then one more
		 * honor per non-empty line returned by {@code getMultiLine}.
		 */
		private void parseHonors() {
			String firstField = getTextByTagAtt("class", "honors");

			// Compare by value rather than by reference.
			if ("".equals(firstField)) {
				return;
			}
			Honor honor = new Honor(profile.person);
			honor.name = Util.cleanString(firstField);
			profile.honors.add(honor);

			String remaining = getMultiLine();
			if (remaining.equals(""))
				return;
			for (String line : remaining.split("\n")) {
				honor = new Honor(profile.person);
				honor.name = Util.cleanString(line);
				if (!honor.name.equals(""))
					profile.honors.add(honor);
			}
		}
	}

	/**
	 * Handler for the "summary" section. It registers the section key and
	 * separators but extracts nothing from the page.
	 */
	class SummarySection extends Section {

		public SummarySection() {
			sectionStartKey = "summary";
			separators.add("HR");
			separators.add("hr /");
		}

		/**
		 * Does not read any summary content; only polls {@code sequence}.
		 */
		public void parse() {
			sequence.poll();
		}
	}

	/**
	 * Parses the "nameplate" section into {@code profile.person} and,
	 * unless disabled, the job and school outlines of the overview block.
	 */
	class PersonSection extends Section {

		/** When true, the Current/Past/Education outline is not parsed. */
		public boolean ignorJobSchoolOutline;

		public PersonSection(boolean ignorJobSchoolOutline) {
			sectionStartKey = "nameplate";
			separators.add("HR");
			separators.add("hr /");
			this.ignorJobSchoolOutline = ignorJobSchoolOutline;
		}

		/**
		 * Fills the person's name, headline, location and counters from the
		 * tags listed below, optionally parses the outlines, then polls
		 * {@code sequence}. Returns early, without polling, when the section
		 * is not found or no given name was extracted.
		 */
		public void parse() {
			if (!super.navigateToSection())
				return;

			int pos = l.getPosition();

			// Keys are the attribute values to look for; getTextsByTagAtts
			// is expected to fill in the matching texts (helper not shown
			// here — confirm).
			Hashtable<String, String> ht = new Hashtable<String, String>();
			ht.put("given-name", "");
			ht.put("family-name", "");
			ht.put("headline title", "");
			ht.put("locality", "");
			ht.put("country-name", "");
			ht.put("recommendation-count", "");
			ht.put("connection-count", "");
			getTextsByTagAtts(ht, "hr /");

			// Compare by value: `== ""` only held while the placeholder
			// literal above was still the stored instance.
			if ("".equals(ht.get("given-name")))
				return;
			Person person = profile.person;

			person.first_name = ht.get("given-name");
			person.last_name = ht.get("family-name");
			person.career_heading = ht.get("headline title");
			person.location = ht.get("locality") + ht.get("country-name");
			person.recommendation_count = Util.getNumberFromString(ht
					.get("recommendation-count"));
			person.connection_count = Util.getNumberFromString(ht
					.get("connection-count"));
			// Rewind so the outline lookup starts from the section start.
			l.setPosition(pos);

			if (!ignorJobSchoolOutline) {
				getTextByTagAtt("id", "overview");
				parseJobOutline();
				parseSchoolOutLine();
			}

			sequence.poll();
		}

		/** Reads the "Current" and then the "Past" job lists when present. */
		private void parseJobOutline() {

			// indexOf returns -1 when absent, so presence is ">= 0".
			if (page.indexOf("<dt>Current</dt>", l.getPosition()) >= 0)
				getSimpleJobsSub();

			if (page.indexOf("<dt>Past</dt>", l.getPosition()) >= 0)
				getSimpleJobsSub();

		}

		/**
		 * Reads the next list and pushes a {@code Job} for every item of the
		 * form "role at organization"; other items are ignored.
		 */
		private void getSimpleJobsSub() {
			ArrayList<String> jobsList = getList();
			for (String s : jobsList) {
				String jobDesp = Util.cleanString(s);
				String[] jobArray = jobDesp.split(" at ");
				if (jobArray.length == 2) {
					Job job = new Job(profile.person);
					job.role = jobArray[0];
					job.organization_name = jobArray[1];
					jobs.push(job);
				}
			}
		}

		/**
		 * Reads the "Education" list, when present, and pushes one
		 * {@code School} per item.
		 */
		private void parseSchoolOutLine() {

			if (page.indexOf("<dt>Education</dt>", l.getPosition()) < 0)
				return;
			ArrayList<String> schoolsList = getList();
			for (String s : schoolsList) {
				School school = new School(profile.person);
				school.university = Util.cleanString(s);
				schools.push(school);
			}

		}

		/**
		 * Collects the texts returned by {@code getTextByTag("li", "/ul")}
		 * until it returns the empty string.
		 */
		private ArrayList<String> getList() {
			ArrayList<String> list = new ArrayList<String>();
			String item;
			// Compare by value: `!= ""` only tested reference identity.
			while (!"".equals(item = getTextByTag("li", "/ul"))) {
				list.add(item);
			}
			return list;
		}

	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -