// EmailTokenizer.java
                if (urlList != null)
                {
                    // Append any URLs found in the message to the word tokens
                    String[] urls = (String[]) urlList.toArray(new String[urlList.size()]);
                    String[] all = null;

                    // toArray never returns null, so the only case to guard
                    // against is tokens being null
                    if (tokens != null)
                    {
                        all = new String[tokens.length + urls.length];
                        System.arraycopy(tokens, 0, all, 0, tokens.length);
                        System.arraycopy(urls, 0, all, tokens.length, urls.length);
                    }
                    else
                    {
                        // No word tokens were extracted; the URLs are all we have
                        all = urls;
                    }

                    return all;
                }
                else
                {
                    return tokens;
                }
            }
        }
        catch (IOException e)
        {
            throw new JasenException(e);
        }
        catch (MessagingException e)
        {
            throw new JasenException(e);
        }
    }
    /*
     * (non-Javadoc)
     * @see org.jasen.interfaces.MimeMessageTokenizer#tokenize(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.ParserData)
     */
    public String[] tokenize(MimeMessage mail, JasenMessage message, ParserData data) throws JasenException {
        String realHtml = message.getHtmlPart();
        String realText = message.getTextPart();
        String html = data.getHtmlAsText();
        String text = data.getTextParsed();
        String tokenizee = null;

        // Prefer the HTML body (rendered as text); fall back to the plain text part
        if (html != null && html.trim().length() > 0)
        {
            tokenizee = html;
        }
        else if (text != null && text.trim().length() > 0)
        {
            tokenizee = text;
        }

        return tokenize(mail, realHtml, realText, text, html, tokenizee, data.getTokenErrorRecorder());
    }
    /**
     * We won't use the Collection.contains method because we want to ignore
     * case. Note that Arrays.binarySearch with no comparator compares
     * case-sensitively, so this works only if the header name arrives
     * lower-cased and IGNORED_HEADERS is sorted in the same form.
     *
     * @param header the header name to test (expected in lower case)
     * @return true if the header should be excluded from tokenization
     */
    private boolean ignoreHeader(String header) {
        // The "x" prefix test also catches extension headers such as "x-mailer"
        return (Arrays.binarySearch(IGNORED_HEADERS, header) > -1 || header.startsWith("x"));
    }

    private boolean includeHeader(String header) {
        return (Arrays.binarySearch(INCLUDED_HEADERS, header) > -1);
    }
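
    /*
     * Illustrative sketch (not part of the original class): a variant that
     * handles case itself. Arrays.binarySearch accepts a comparator, so if
     * IGNORED_HEADERS were sorted with String.CASE_INSENSITIVE_ORDER the
     * lookup could ignore case without pre-lower-casing the header. The
     * method name is hypothetical.
     */
    private boolean ignoreHeaderIgnoreCase(String header) {
        // Requires IGNORED_HEADERS to be sorted with this same comparator
        return Arrays.binarySearch(IGNORED_HEADERS, header, String.CASE_INSENSITIVE_ORDER) > -1
                || header.regionMatches(true, 0, "x", 0, 1); // case-insensitive startsWith("x")
    }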
    /**
     * Gets the maximum number of linguistic errors tolerated before tokenization is aborted.
     * <P>
     * The tokenizer uses the LinguisticAnalyzer to determine if each token is a real word.
     * After linguisticLimit tokens have failed in succession, tokenization is aborted.
     * </P>
     * @return Returns the linguisticLimit.
     */
    public int getLinguisticLimit() {
        return linguisticLimit;
    }

    /**
     * Sets the maximum number of linguistic errors tolerated before tokenization is aborted.
     * @param linguisticLimit The linguisticLimit to set.
     * @see EmailTokenizer#getLinguisticLimit()
     */
    public void setLinguisticLimit(int linguisticLimit) {
        this.linguisticLimit = linguisticLimit;
    }
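
    /*
     * Illustrative sketch (not original code) of the successive-failure rule
     * described above. The test harness in main() scores tokens with
     * LinguisticAnalyzer.getWordScore against a 0.1 threshold; that threshold
     * is an assumption here, not necessarily what tokenize() itself uses.
     */
    private boolean exceedsLinguisticLimit(String[] tokens) {
        final double threshold = 0.1d; // assumed cut-off, borrowed from main()
        int successiveFailures = 0;

        for (int i = 0; i < tokens.length; i++) {
            if (LinguisticAnalyzer.getInstance().getWordScore(tokens[i]) >= threshold) {
                successiveFailures = 0; // a real word resets the count
            } else if (++successiveFailures >= linguisticLimit) {
                return true; // linguisticLimit failures in a row: abort
            }
        }
        return false;
    }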
    /**
     * Tells us if we are ignoring the list of IGNORED_HEADERS when tokenizing
     * @return True if the tokenizer is ignoring headers in the IGNORED_HEADERS set
     * @see EmailTokenizer#IGNORED_HEADERS
     */
    public boolean isIgnoreHeaders() {
        return ignoreHeaders;
    }

    /**
     * Flags the tokenizer to ignore the list of IGNORED_HEADERS when tokenizing
     * @param b true to skip the ignored headers, false to tokenize them as well
     */
    public void setIgnoreHeaders(boolean b) {
        ignoreHeaders = b;
    }
    /**
     * Gets the maximum number of tokens extracted before tokenization is aborted
     * @return The maximum number of tokens that will be returned
     */
    public int getTokenLimit() {
        return tokenLimit;
    }

    /**
     * Sets the maximum number of tokens extracted before tokenization is aborted
     * @param i the new token limit
     */
    public void setTokenLimit(int i) {
        tokenLimit = i;

        // Propagate the limit to the underlying tokenizer if it already exists
        if (tokenizer != null)
        {
            tokenizer.setMaxTokens(i);
        }
    }
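
    /*
     * Typical configuration, for illustration only (the limit values below
     * are hypothetical; see main() for a real invocation):
     *
     *   EmailTokenizer et = new EmailTokenizer();
     *   et.setIgnoreHeaders(true);  // skip IGNORED_HEADERS and "x" headers
     *   et.setTokenLimit(1000);     // hypothetical cap on extracted tokens
     *   et.setLinguisticLimit(20);  // hypothetical successive-failure limit
     *   String[] tokens = et.tokenize(mail, jasenMessage, parserData);
     */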
    /**
     * Internal test harness only. DO NOT USE
     * @param args ignored
     */
    public static void main(String[] args) {
        try
        {
            //File mailDir = new File("D:\\Projects\\Synetek\\EverySpam\\DebugTests");
            //File mailDir = new File("D:\\Projects\\Synetek\\EverySpam\\DebugTests_Special");
            //File mailDir = new File("D:\\Projects\\Synetek\\EverySpam\\DebugTests_Source");
            File mailDir = new File("D:\\Projects\\Synetek\\Service\\EveryMail\\core\\poll");
            SMTPMessage mail = null;
            File[] files = mailDir.listFiles();
            File output = new File("c:/output.txt");

            if (output.exists())
            {
                output.delete();
            }

            FileOutputStream fout = new FileOutputStream(output);
            FileInputStream fin = null;
            StandardMimeMessageParser parser = null;
            JasenMessage jm = null;
            PrintWriter writer = new PrintWriter(fout);
            for (int i = 0; i < files.length; i++)
            {
                if (files[i].isFile())
                {
                    try
                    {
                        writer.println("*************************************************");
                        writer.println("File " + (i + 1) + ": " + files[i].getName());
                        writer.println("*************************************************");

                        fin = new FileInputStream(files[i]);
                        mail = new SMTPMessage(null, fin);
                        parser = new StandardMimeMessageParser();
                        jm = parser.parse(mail);
                        writer.println("HTML: " + jm.getHtmlPart());

                        EmailTokenizer et = new EmailTokenizer();
                        et.setIgnoreHeaders(true);
                        long time = System.currentTimeMillis();
                        String[] tokens = et.tokenize(mail, jm, null);

                        if (tokens != null)
                        {
                            //Arrays.sort(tokens);
                            /*
                             * Note to self:
                             * To decide whether a token belongs in the list used to train the
                             * spam filter, we test it first against the dictionary; if no match
                             * is found we test it against the lexical analyzer. If it fails the
                             * lex test we record the failure, and the failure count returns to
                             * zero as soon as a valid token appears. Only after a threshold of
                             * successive failures has been reached do we deem tokens "invalid";
                             * from then on, further failures are treated as true failures
                             * immediately. This relies on the premise that tokens are listed in
                             * the order they appear in the mail, and "most" invalid tokens
                             * occur at the end of the message.
                             */
                            LinguisticAnalyzer.getInstance(); // initialise the analyzer singleton up front
                            double threshold = 0.1d;
                            double prob;

                            for (int j = 0; j < tokens.length; j++)
                            {
                                writer.println("TOKEN: [" + tokens[j] + "]");
                                prob = LinguisticAnalyzer.getInstance().getWordScore(tokens[j]);
                                /* if (prob >= threshold)
                                {
                                    System.out.println("Yes:" + tokens[j] + ": " + prob);
                                }
                                else
                                {
                                    System.out.println("NO:" + tokens[j] + ": " + prob);
                                }*/
                            }
                        }

                        fin.close(); // the stream was fully consumed by the parse above
                    }
                    catch (Exception e)
                    {
                        e.printStackTrace();
                    }
                }
                System.out.println("Processed " + (i + 1) + "/" + files.length);
            }

            writer.flush();
            writer.close(); // closing the writer also flushes and closes fout
        }
        catch (Exception ex)
        {
            ex.printStackTrace();
        }
    }
}