// EmailTokenizer.java
                if (urlList != null)
                {
                    // Append any URLs found in the message to the word tokens
                    String[] urls = (String[]) urlList.toArray(new String[urlList.size()]);
                    String[] all = null;

                    // toArray never returns null, so the only case to guard
                    // against is tokens being null
                    if (tokens != null)
                    {
                        all = new String[tokens.length + urls.length];
                        System.arraycopy(tokens, 0, all, 0, tokens.length);
                        System.arraycopy(urls, 0, all, tokens.length, urls.length);
                    }
                    else
                    {
                        // No word tokens were extracted; the URLs are all we have
                        all = urls;
                    }

                    return all;
                }
                else
                {
                    return tokens;
                }
            }
        }
        catch (IOException e)
        {
            throw new JasenException(e);
        }
        catch (MessagingException e)
        {
            throw new JasenException(e);
        }
    }
    /*
     * (non-Javadoc)
     * @see org.jasen.interfaces.MimeMessageTokenizer#tokenize(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.ParserData)
     */
    public String[] tokenize(MimeMessage mail, JasenMessage message, ParserData data) throws JasenException {
        String realHtml = message.getHtmlPart();
        String realText = message.getTextPart();
        String html = data.getHtmlAsText();
        String text = data.getTextParsed();
        String tokenizee = null;

        // Prefer the HTML body (rendered as text); fall back to the plain text part
        if (html != null && html.trim().length() > 0)
        {
            tokenizee = html;
        }
        else if (text != null && text.trim().length() > 0)
        {
            tokenizee = text;
        }

        return tokenize(mail, realHtml, realText, text, html, tokenizee, data.getTokenErrorRecorder());
    }
    /**
     * We won't use the Collection.contains method because we want to ignore
     * case. Note that Arrays.binarySearch with no comparator compares
     * case-sensitively, so this works only if the header name arrives
     * lower-cased and IGNORED_HEADERS is sorted in the same form.
     *
     * @param header the header name to test (expected in lower case)
     * @return true if the header should be excluded from tokenization
     */
    private boolean ignoreHeader(String header) {
        // The "x" prefix test also catches extension headers such as "x-mailer"
        return (Arrays.binarySearch(IGNORED_HEADERS, header) > -1 || header.startsWith("x"));
    }

    private boolean includeHeader(String header) {
        return (Arrays.binarySearch(INCLUDED_HEADERS, header) > -1);
    }
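
    /*
     * Illustrative sketch (not part of the original class): a variant that
     * handles case itself. Arrays.binarySearch accepts a comparator, so if
     * IGNORED_HEADERS were sorted with String.CASE_INSENSITIVE_ORDER the
     * lookup could ignore case without pre-lower-casing the header. The
     * method name is hypothetical.
     */
    private boolean ignoreHeaderIgnoreCase(String header) {
        // Requires IGNORED_HEADERS to be sorted with this same comparator
        return Arrays.binarySearch(IGNORED_HEADERS, header, String.CASE_INSENSITIVE_ORDER) > -1
                || header.regionMatches(true, 0, "x", 0, 1); // case-insensitive startsWith("x")
    }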
    /**
     * Gets the maximum number of linguistic errors tolerated before tokenization is aborted.
     * <P>
     * The tokenizer uses the LinguisticAnalyzer to determine if each token is a real word.
     * After linguisticLimit tokens have failed in succession, tokenization is aborted.
     * </P>
     * @return Returns the linguisticLimit.
     */
    public int getLinguisticLimit() {
        return linguisticLimit;
    }

    /**
     * Sets the maximum number of linguistic errors tolerated before tokenization is aborted.
     * @param linguisticLimit The linguisticLimit to set.
     * @see EmailTokenizer#getLinguisticLimit()
     */
    public void setLinguisticLimit(int linguisticLimit) {
        this.linguisticLimit = linguisticLimit;
    }
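
    /*
     * Illustrative sketch (not original code) of the successive-failure rule
     * described above. The test harness in main() scores tokens with
     * LinguisticAnalyzer.getWordScore against a 0.1 threshold; that threshold
     * is an assumption here, not necessarily what tokenize() itself uses.
     */
    private boolean exceedsLinguisticLimit(String[] tokens) {
        final double threshold = 0.1d; // assumed cut-off, borrowed from main()
        int successiveFailures = 0;

        for (int i = 0; i < tokens.length; i++) {
            if (LinguisticAnalyzer.getInstance().getWordScore(tokens[i]) >= threshold) {
                successiveFailures = 0; // a real word resets the count
            } else if (++successiveFailures >= linguisticLimit) {
                return true; // linguisticLimit failures in a row: abort
            }
        }
        return false;
    }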
    /**
     * Tells us if we are ignoring the list of IGNORED_HEADERS when tokenizing
     * @return True if the tokenizer is ignoring headers in the IGNORED_HEADERS set
     * @see EmailTokenizer#IGNORED_HEADERS
     */
    public boolean isIgnoreHeaders() {
        return ignoreHeaders;
    }

    /**
     * Flags the tokenizer to ignore the list of IGNORED_HEADERS when tokenizing
     * @param b true to skip the ignored headers, false to tokenize them as well
     */
    public void setIgnoreHeaders(boolean b) {
        ignoreHeaders = b;
    }
    /**
     * Gets the maximum number of tokens extracted before tokenization is aborted
     * @return The maximum number of tokens that will be returned
     */
    public int getTokenLimit() {
        return tokenLimit;
    }

    /**
     * Sets the maximum number of tokens extracted before tokenization is aborted
     * @param i the new token limit
     */
    public void setTokenLimit(int i) {
        tokenLimit = i;

        // Propagate the limit to the underlying tokenizer if it already exists
        if (tokenizer != null)
        {
            tokenizer.setMaxTokens(i);
        }
    }
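
    /*
     * Typical configuration, for illustration only (the limit values below
     * are hypothetical; see main() for a real invocation):
     *
     *   EmailTokenizer et = new EmailTokenizer();
     *   et.setIgnoreHeaders(true);  // skip IGNORED_HEADERS and "x" headers
     *   et.setTokenLimit(1000);     // hypothetical cap on extracted tokens
     *   et.setLinguisticLimit(20);  // hypothetical successive-failure limit
     *   String[] tokens = et.tokenize(mail, jasenMessage, parserData);
     */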
    /**
     * Internal test harness only. DO NOT USE
     * @param args ignored
     */
    public static void main(String[] args) {
        try
        {
            //File mailDir = new File("D:\\Projects\\Synetek\\EverySpam\\DebugTests");
            //File mailDir = new File("D:\\Projects\\Synetek\\EverySpam\\DebugTests_Special");
            //File mailDir = new File("D:\\Projects\\Synetek\\EverySpam\\DebugTests_Source");
            File mailDir = new File("D:\\Projects\\Synetek\\Service\\EveryMail\\core\\poll");
            SMTPMessage mail = null;
            File[] files = mailDir.listFiles();
            File output = new File("c:/output.txt");

            if (output.exists())
            {
                output.delete();
            }

            FileOutputStream fout = new FileOutputStream(output);
            FileInputStream fin = null;
            StandardMimeMessageParser parser = null;
            JasenMessage jm = null;
            PrintWriter writer = new PrintWriter(fout);
            for (int i = 0; i < files.length; i++)
            {
                if (files[i].isFile())
                {
                    try
                    {
                        writer.println("*************************************************");
                        writer.println("File " + (i + 1) + ": " + files[i].getName());
                        writer.println("*************************************************");

                        fin = new FileInputStream(files[i]);
                        mail = new SMTPMessage(null, fin);
                        parser = new StandardMimeMessageParser();
                        jm = parser.parse(mail);
                        writer.println("HTML: " + jm.getHtmlPart());

                        EmailTokenizer et = new EmailTokenizer();
                        et.setIgnoreHeaders(true);
                        long time = System.currentTimeMillis();
                        String[] tokens = et.tokenize(mail, jm, null);

                        if (tokens != null)
                        {
                            //Arrays.sort(tokens);
                            /*
                             * Note to self:
                             * To decide whether a token belongs in the list used to train the
                             * spam filter, we test it first against the dictionary; if no match
                             * is found we test it against the lexical analyzer. If it fails the
                             * lex test we record the failure, and the failure count returns to
                             * zero as soon as a valid token appears. Only after a threshold of
                             * successive failures has been reached do we deem tokens "invalid";
                             * from then on, further failures are treated as true failures
                             * immediately. This relies on the premise that tokens are listed in
                             * the order they appear in the mail, and "most" invalid tokens
                             * occur at the end of the message.
                             */
                            LinguisticAnalyzer.getInstance(); // initialise the analyzer singleton up front
                            double threshold = 0.1d;
                            double prob;

                            for (int j = 0; j < tokens.length; j++)
                            {
                                writer.println("TOKEN: [" + tokens[j] + "]");
                                prob = LinguisticAnalyzer.getInstance().getWordScore(tokens[j]);
                                /* if (prob >= threshold)
                                {
                                    System.out.println("Yes:" + tokens[j] + ": " + prob);
                                }
                                else
                                {
                                    System.out.println("NO:" + tokens[j] + ": " + prob);
                                }*/
                            }
                        }

                        fin.close(); // the stream was fully consumed by the parse above
                    }
                    catch (Exception e)
                    {
                        e.printStackTrace();
                    }
                }
                System.out.println("Processed " + (i + 1) + "/" + files.length);
            }

            writer.flush();
            writer.close(); // closing the writer also flushes and closes fout
        }
        catch (Exception ex)
        {
            ex.printStackTrace();
        }
    }
}