📄 parseprofile.java
字号:
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Stack;
import javax.swing.text.DateFormatter;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Cursor;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.util.ParserException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.parserapplications.filterbuilder.*;
import org.htmlparser.filters.TagNameFilter;
public class ParseProfile {
// True when saveToDB() should insert the profile; false makes it update instead.
// The constructor sets this to true.
private boolean newPerson;
// Sections to parse, built by init(); parse() repeatedly calls parse() on the head.
LinkedList<Section> sequence;
// Running count of lexer nodes consumed; compared against maxNodeCount by the scan loops.
int nodeCount = 0;
// Upper bound on nodeCount used by the scan loops as a stop condition.
final static int maxNodeCount = 5000;
// Raw HTML being parsed; copied from profile.person.profile_cached_html in the constructor.
public String page;
// Not referenced in the visible part of this file.
String link;
// The profile being populated and later saved.
Profile profile;
// Jobs collected during parsing; reversePos() pops them into profile.jobs.
Stack<Job> jobs;
// Schools collected during parsing; reversePos() pops them into profile.schools.
Stack<School> schools;
// org.htmlparser.nodes.AbstractNode n;
// Current lexer node; shared cursor advanced by every scanning helper.
Node n;
// Parser fed with the page HTML in init().
org.htmlparser.Parser p;
// Lexer obtained from p; source of nodes for n.
Lexer l;
// Not referenced in the visible part of this file.
public boolean silent;
// Text buffer that saveText() passes to profile.appendTo() and then writes to disk.
public StringBuilder sb = new StringBuilder();
/**
 * Manual driver: loads one locally saved page, parses it into a new
 * {@link Profile}, stores it in the database, then parses again and writes
 * the text dump next to the input file.
 *
 * @param args ignored
 * @throws IOException declared for the local page load
 * @throws SQLException if opening the connection or the second parse fails
 */
public static void main(String[] args) throws IOException, SQLException {
    final String folder = "C:\\temp2\\";
    final String fileName = "149047.htm";

    String html = Util.getAPageLocal(folder + fileName);
    Profile seed = new Profile(DataAccess.getNewConnection(), 1, html);
    ParseProfile pp = new ParseProfile(seed);

    try {
        pp.parseAndSaveToDB();
    } catch (SQLException dbFailure) {
        // A failed save is reported but does not stop the text export below.
        dbFailure.printStackTrace();
    }

    pp.parse();
    pp.saveText(folder);
}
/**
 * Creates a parser for the given profile, treating it as a new person
 * (so {@link #saveToDB()} will insert), and prepares the lexer and section list.
 *
 * @param profile2 profile whose {@code person.profile_cached_html} is the page to parse
 */
public ParseProfile(Profile profile2) {
    newPerson = true;
    profile = profile2;
    page = profile2.person.profile_cached_html;
    init();
}
/**
 * Parses the page and, only if parsing reports success, saves the profile.
 *
 * @throws SQLException if parsing or saving raises it
 */
public void parseAndSaveToDB() throws SQLException {
    boolean parsed = parse();
    if (!parsed) {
        return;
    }
    saveToDB();
}
/**
 * Checks that the page contains the person section marker.
 *
 * @return {@code true} if a freshly built {@code PersonSection} reports that its section exists
 */
private boolean validate() {
    return new PersonSection(false).sectionExists();
}
/**
 * Writes both the HTML copy and the text dump of the profile into {@code folder}.
 *
 * @param folder path prefix; file names are appended directly to it
 */
public void saveToFile(String folder) {
    this.saveHtml(folder);
    this.saveText(folder);
}
/**
 * Appends the profile to the shared buffer {@code sb} and writes the whole
 * buffer to {@code <folder><person id>.txt}.
 *
 * @param folder path prefix; the file name is appended directly to it
 */
public void saveText(String folder) {
    profile.appendTo(sb);
    String target = folder + profile.person.id + ".txt";
    String contents = sb.toString();
    Util.saveAFile(target, contents);
}
/**
 * Writes the raw page HTML to {@code <folder><person id>.htm}.
 *
 * @param folder path prefix; the file name is appended directly to it
 */
public void saveHtml(String folder) {
    String target = folder + profile.person.id + ".htm";
    Util.saveAFile(target, page);
}
/**
 * Persists the profile: inserts it when this parser was created for a new
 * person, otherwise updates the existing record.
 *
 * @throws SQLException if the insert or update raises it
 */
public void saveToDB() throws SQLException {
    if (!newPerson) {
        profile.update();
        return;
    }
    profile.insert();
}
/**
 * Drains the job and school stacks into the profile, numbering entries
 * 1, 2, 3, ... in pop order (most recently pushed entry gets position 1).
 */
private void reversePos() {
    int jobRank = 0;
    while (!jobs.isEmpty()) {
        Job entry = jobs.pop();
        jobRank++;
        entry.position = jobRank;
        profile.jobs.add(entry);
    }

    int schoolRank = 0;
    while (!schools.isEmpty()) {
        School entry = schools.pop();
        schoolRank++;
        entry.position = schoolRank;
        profile.schools.add(entry);
    }
}
/**
 * @return the profile this parser populates
 */
public Profile getProfile() {
    return this.profile;
}
/**
 * Runs the section parsers over the page.
 * <p>
 * Returns immediately if the page fails validation. Otherwise the head of
 * the section list is asked to parse repeatedly until the list has no head,
 * the lexer cursor is {@code null}, or the node budget is used up; then the
 * collected jobs and schools are moved into the profile.
 *
 * @return {@code false} if validation failed, {@code true} otherwise
 * @throws SQLException if a section parser raises it
 */
public boolean parse() throws SQLException {
    if (!validate()) {
        return false;
    }
    for (Section head = sequence.peek();
            head != null && n != null && nodeCount < maxNodeCount;
            head = sequence.peek()) {
        nodeCount++;
        head.parse();
    }
    reversePos();
    return true;
}
/**
 * Sets up the parser, lexer and first node for the page, then builds the
 * ordered list of sections to parse and the empty job/school stacks.
 * <p>
 * The person section always comes first; summary, job, school and
 * additional sections are appended only when their marker is found in the page.
 *
 * @return {@code false} if the HTML parser rejects the page (in which case
 *         the section list and stacks are left unset), {@code true} otherwise
 */
private boolean init() {
    p = new Parser();
    try {
        p.setInputHTML(page);
        l = p.getLexer();
        n = (AbstractNode) l.nextNode();
    } catch (ParserException parseFailure) {
        parseFailure.printStackTrace();
        return false;
    }

    // Probe instances are used only to test for presence in the page.
    JobSection jobProbe = new JobSection();
    SchoolSection schoolProbe = new SchoolSection();
    boolean hasJobs = jobProbe.sectionExists();
    boolean hasSchools = schoolProbe.sectionExists();

    sequence = new LinkedList<Section>();
    sequence.addFirst(new PersonSection(hasJobs || hasSchools));

    SummarySection summary = new SummarySection();
    if (summary.sectionExists()) {
        sequence.add(summary);
    }
    if (hasJobs) {
        sequence.add(new JobSection());
    }
    if (hasSchools) {
        sequence.add(new SchoolSection());
    }

    AdditionalSection additional = new AdditionalSection();
    if (additional.sectionExists()) {
        sequence.add(additional);
    }

    jobs = new Stack<Job>();
    schools = new Stack<School>();
    return true;
}
class Section {
// Node texts that mark a boundary for this section; isASeperator() compares
// tag text against these case-insensitively.
public ArrayList<String> separators = new ArrayList<String>(2);
// Not referenced in the visible part of this file.
public ArrayList<String> firstFieldKeys = new ArrayList<String>(2);
// public ArrayList<String> sectionStartKeys = new ArrayList<String>(2);
// Value of the "id" attribute that marks where this section starts; used by
// navigateToSection() and sectionExists().
public String sectionStartKey;
// Not referenced in the visible part of this file.
TagNameFilter tnf = new TagNameFilter();
/**
 * Parses this section. The base implementation does nothing.
 *
 * @throws SQLException declared so that overriding implementations may raise it
 */
public void parse() throws SQLException {
    // intentionally empty
}
/**
 * Advances the shared lexer cursor until it rests on a tag whose
 * {@code id} attribute equals {@code sectionStartKey}.
 *
 * @return {@code true} if such a tag was reached; {@code false} if the
 *         input ended, the node budget ran out, or the lexer failed
 */
public boolean navigateToSection() {
    try {
        for (n = l.nextNode();
                n != null && nodeCount < maxNodeCount;
                n = l.nextNode()) {
            nodeCount++;
            if (!(n instanceof Tag)) {
                continue;
            }
            String id = ((Tag) n).getAttribute("id");
            if (id != null && id.equals(sectionStartKey)) {
                return true;
            }
        }
    } catch (ParserException lexFailure) {
        lexFailure.printStackTrace();
    }
    return false;
}
/**
 * Tests whether the page text contains this section's opening {@code div},
 * written either with an unquoted or a double-quoted {@code id} value.
 *
 * @return {@code true} if either form occurs in the page
 */
public boolean sectionExists() {
    String unquoted = "<div id=" + sectionStartKey + ">";
    if (page.contains(unquoted)) {
        return true;
    }
    String quoted = "<div id=\"" + sectionStartKey + "\">";
    return page.contains(quoted);
}
/**
 * Tells whether {@code nodeText} is a separator for the section currently
 * at the head of the section list.
 *
 * @param nodeText text of the node being examined
 * @return {@code true} if no section is left, or if the text matches one of
 *         the head section's separators (case-insensitive)
 */
protected boolean isASeperator(String nodeText) {
    Section current = sequence.peek();
    return current == null || equalKeyword(nodeText, current.separators);
}
/**
 * Case-insensitive membership test.
 *
 * @param nodeText text to look for
 * @param keywords candidates to compare against
 * @return {@code true} if any keyword equals {@code nodeText} ignoring case
 */
private boolean equalKeyword(String nodeText, ArrayList<String> keywords) {
    for (int i = 0; i < keywords.size(); i++) {
        if (nodeText.equalsIgnoreCase(keywords.get(i))) {
            return true;
        }
    }
    return false;
}
/**
 * Scans forward from the current node and fills {@code ht} with the text
 * that follows each tag whose {@code class} attribute is one of the table's keys.
 * <p>
 * Class values containing {@code "headline title"} or
 * {@code "recommendation-count"} are reduced to exactly that string before
 * the key lookup. For a matching tag, the cursor skips any further tags and
 * the first non-tag node's cleaned text becomes the value (empty string if
 * the input ended). Scanning stops at a tag whose text equals
 * {@code stopTag}, at end of input, or when the node budget is exhausted.
 *
 * @param ht      table whose keys are the class values to look for; values are overwritten
 * @param stopTag tag text that ends the scan
 */
protected void getTextsByTagAtts(Hashtable<String, String> ht,
        String stopTag) {
    while (nodeCount < maxNodeCount && n != null) {
        if (n instanceof Tag) {
            if (n.getText().equals(stopTag)) {
                break;
            }
            String classValue = ((Tag) n).getAttribute("class");
            if (classValue != null) {
                if (classValue.contains("headline title")) {
                    classValue = "headline title";
                }
                if (classValue.contains("recommendation-count")) {
                    classValue = "recommendation-count";
                }
                if (ht.containsKey(classValue)) {
                    // Skip nested tags to reach the text. The loop is bounded by
                    // the node budget: a failing nextNode() leaves n unchanged,
                    // which would otherwise make this loop spin forever.
                    while (n instanceof Tag && nodeCount < maxNodeCount) {
                        try {
                            n = l.nextNode();
                        } catch (ParserException e) {
                            e.printStackTrace();
                        }
                        nodeCount++;
                    }
                    // Only store a value if the skip actually left the tags;
                    // never record a tag's own markup as the field text.
                    if (!(n instanceof Tag)) {
                        String textValue = n == null ? "" : Util.cleanString(n
                                .getText());
                        ht.put(classValue, textValue);
                    }
                }
            }
        }
        try {
            n = l.nextNode();
        } catch (ParserException e) {
            e.printStackTrace();
        }
        nodeCount++;
    }
}
/**
 * Looks ahead for a tag whose {@code class} attribute equals
 * {@code pattern} (ignoring case) and returns its {@code title} attribute.
 * <p>
 * At most {@code next + 1} nodes are examined (always at least one). The
 * search also stops early at a tag with class {@code "dtstamp"}. In every
 * case the cursor is advanced one more node before returning.
 *
 * @param pattern class attribute value to look for
 * @param next    number of additional nodes to examine after the first
 * @return the matching tag's {@code title}, or an empty string if no tag
 *         matched or the matching tag has no {@code title} attribute
 */
protected String getDateTag(String pattern, int next) {
    String dateString = "";
    int count = 0;
    do {
        try {
            n = l.nextNode();
            nodeCount++;
        } catch (ParserException e) {
            e.printStackTrace();
        }
        if (n instanceof Tag) {
            Tag tag = (Tag) n;
            String classValue = tag.getAttribute("class");
            if (classValue != null) {
                if (classValue.equalsIgnoreCase(pattern)) {
                    // getAttribute yields null for a missing attribute; keep the
                    // "not found" result as "" so callers never receive null.
                    String title = tag.getAttribute("title");
                    if (title != null) {
                        dateString = title;
                    }
                    break;
                }
                if (classValue.equals("dtstamp")) {
                    break;
                }
            }
        }
    } while (count++ < next);
    try {
        n = l.nextNode();
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return dateString;
}
/**
 * Advances the cursor looking for a tag whose text equals {@code tag}
 * (ignoring case) and returns the text of the first non-tag node after it.
 * <p>
 * The search gives up at a tag whose text equals {@code stopTag}, at a
 * separator of the current section, at end of input, or when the node
 * budget is exhausted.
 *
 * @param tag     tag text to look for
 * @param stopTag tag text that ends the search
 * @return the text following the matching tag, or an empty string if the
 *         search ended without a match or the input ended after the match
 */
protected String getTextByTag(String tag, String stopTag) {
    String result = "";
    try {
        n = l.nextNode();
        do {
            if (!(n instanceof Tag)) {
                nodeCount++;
                continue;
            }
            String markup = n.getText();
            if (markup.equals(stopTag) || isASeperator(markup)) {
                break;
            }
            if (markup.equalsIgnoreCase(tag)) {
                // n is a tag here, so at least one step is always taken.
                do {
                    n = l.nextNode();
                    nodeCount++;
                } while (n instanceof Tag);
                result = (n != null) ? n.getText() : "";
                break;
            }
            nodeCount++;
        } while ((n = l.nextNode()) != null && nodeCount < maxNodeCount);
    } catch (ParserException lexFailure) {
        lexFailure.printStackTrace();
    }
    return result;
}
// This method compares the tag attribute in a less strict way, using String's
// contains method,
// and it checks for the stop signal; for non-critical fields this method is
// more flexible.
protected String getTextByTagAtt2(String attValue, String stopTag) {
String attValueFound = "";
String textFound = "";
try {
n = l.nextNode();
do {
nodeCount++;
if (n instanceof Tag) {
if (n.getText().equalsIgnoreCase(stopTag)
|| isASeperator(n.getText()))
break;
if ((attValueFound = ((Tag) n).getAttribute("class")) != null)
if (attValueFound.contains(attValue)) {
while (n instanceof Tag) {
n = l.nextNode();
nodeCount++;
}
textFound = n == null ? "" : n.getText();
break;
}
}
} while ((n = l.nextNode()) != null && nodeCount < maxNodeCount);
} catch (ParserException e) {
e.printStackTrace();
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -