📄 trainbogo.sh
字号:
#!/bin/sh
# Train bogofilter from a ham and spam corpus
#
# Copyright 2003 by Trevor Harrison (trevor-trainbogo@harrison.org)
#
# This file is released under the GPL. See http://www.gnu.org/licenses/gpl.txt
# $Id: trainbogo.sh,v 1.6 2004/05/30 20:25:16 m-a Exp $
#
# Note: this script has not yet had bogofilter maintainer review.
# Security concerned people should not run it if in doubt about its security.
#
# Overview
#   1. Build (or reuse) an index file, one message per line: "<status> <path>"
#      where <status> is the bogofilter exit code expected for that message
#      (0 for files under the spam directory, 1 for files under the ham
#      directory).  The order is pseudo-random: lines are sorted on the MD5 of
#      "<rndseed><path>", so the same -p seed reproduces the same order.
#   2. Run bogofilter on every indexed message, log hits and misses, and
#      (unless -n) train on each miss and test that message again.
#   3. Print totals taken from the index and the log files.
#
# Exit status: 0 on success and for -h, 1 on a runtime error,
#              2 on a command line error.

# Print the option summary on stdout.
usage() {
  cat <<'EOF'
USAGE:

  trainbogo.sh [options]

OPTIONS:

  Required arguments:
    -H hamdir             points to directory with all your ham
    -S spamdir            points to directory will all your spam

  Optional arguments:
    -s statdir            directory where stat and tmp files are created.
                          default is ./stats.tmp
    -b pathtobogofilter   points to the bogofilter executable,
                          with any bogofilter options you need.
                          ex. -b "/usr/local/bin/bogofilter -d /etc/bogodb"
    -f                    force rebuild of ham and spam directory index. Will
                          cause msgs to be sorted into new order unless
                          -p is used.
    -c                    cleanup statdir when done. (default is not to)
    -p rndseed            specify the pid.timestamp used to randomize the msgs.
                          ex. -p 5432.1049498805
    -m                    don't test or train bogofilter, just show cached stats.
    -n                    don't train bogofilter, just test.
    -q                    don't show stats or dots. (quiet)
    -v                    show extra progress messages. (verbose)
    -h                    show help.

EOF
}

# Print the long description followed by the option summary.
help() {
  cat <<'EOF'
trainbogo.sh

  Train bogofilter from a qmail maildir type ham and spam corpus

  This script relies on you having seperated your qmail maildir messages into
  ham and spam directories. This script randomizes the message order, and
  then feeds each message in turn into bogofilter, noting if bogofilter
  correctly identified the message as ham or spam. If mis-identified, it
  trains bogofilter with that message, and then re-tests to see if bogofilter
  correctly identifies the message.

  When I've used this script on my ham/spam collection, it takes about 4
  consecutive executions to get my wordlists to a 0 false positive state.
  Just because this script reports 0 failed trainings doesn't mean that you
  are ready to go. Run the script a second time to make sure. You should
  keep running the script until you get 0 misdetections and, of course, 0
  retrain failed's.

  While running, trainbogo.sh will write some dots and dashes to the screen.

    . = successfully categorized the message.
    - = failed to categorized the message, and training was turned off (-n).
    + = successfully categorized the message after being retrained.
    f = failed to categorize the message after training.

  The results of the testing can be found in the statsdir. Log files have
  the filename of each message that match the logfile name:

    trainbogo.log.[0,1].[success,fail]
      0 = spam message log
      1 = ham message log
      success/fail = were/weren't correctly categorized.

EOF
  usage
}

# Print the arguments as one line, only when -v was given.
verbose() {
  if [ -n "${verbose}" ]; then
    printf '%s\n' "$*"
  fi
}

# Print the arguments as one line, unless -q was given.
normal() {
  if [ -z "${quiet}" ]; then
    printf '%s\n' "$*"
  fi
}

# Like normal, but without the trailing newline (used for progress marks).
normaln() {
  if [ -z "${quiet}" ]; then
    printf '%s' "$*"
  fi
}

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print a message and the option summary on stderr, abort with status 2.
usage_error() {
  printf '%s\n' "$*" >&2
  usage >&2
  exit 2
}

# Remove the index and log files, but only when -c was given.  The stats
# directory itself is removed only if this run created it.
cleanup() {
  verbose "Performing cleanup"
  if [ -z "${log}" ] || [ -z "${list}" ] || [ "${docleanup}" != "y" ]; then
    return 0
  fi
  rm -f -- "${log}".[01].success "${log}".[01].fail \
    "${log}".[01].train.success "${log}".[01].train.fail \
    "${list}"
  if [ "${madestatsdir}" = "y" ] && [ -n "${statsdir}" ]; then
    # Best effort: the directory may legitimately still hold other files, in
    # which case rmdir fails and the directory is left in place.
    rmdir -- "${statsdir}" 2>/dev/null || true
  fi
}

# hash_corpus STATUS DIR
# Write "<md5> STATUS <path>" on stdout for every regular file directly in
# DIR.  The md5 is taken over "<rndseed><path>" and serves only as sort key.
# No "local" here: the script runs under plain /bin/sh.
hash_corpus() {
  for hc_msg in "$2"/*; do
    # Skips sub-directories and the unexpanded pattern of an empty directory.
    [ -f "${hc_msg}" ] || continue
    hc_md5=$(printf '%s' "${rndseed}${hc_msg}" | md5sum) || return 1
    # md5sum prints "<hash>  -"; keep only the hash.
    printf '%s %s %s\n' "${hc_md5%% *}" "$1" "${hc_msg}"
  done
}

# classify FILE
# Run bogofilter on FILE and return bogofilter's exit status.  The caller
# compares that status with the 0/1 tag stored in the index.
classify() {
  # ${bf} is split on purpose: -b may carry bogofilter options.
  # shellcheck disable=SC2086
  ${bf} -v < "$1" > /dev/null
}

# count_lines FILE
# Print the number of lines in FILE (0 if it cannot be read), without the
# padding some wc implementations add.
count_lines() {
  awk 'END { print NR }' < "$1" || echo 0
}

# count_status STATUS
# Print how many index entries carry the given expected status.
count_status() {
  awk -v want="$1" '$1 == want { n++ } END { print n + 0 }' < "${list}" || echo 0
}

# Every variable is initialised so nothing leaks in from the environment.
hamdir=
spamdir=
rndseed=
dofilelist=
dotrain=y
dotest=y
docleanup=
verbose=
quiet=
madestatsdir=
list=
log=
statsdir="${PWD}/stats.tmp/"
origstatsdir="${statsdir}"
bf=bogofilter

while getopts "H:S:s:b:p:fcmnqvh" optname; do
  case "${optname}" in
    H) hamdir=${OPTARG} ;;
    S) spamdir=${OPTARG} ;;
    s) statsdir=${OPTARG} ;;
    b) bf=${OPTARG} ;;
    f) dofilelist=y ;;
    c) docleanup=y ;;
    p) rndseed=${OPTARG} ;;
    m) dotest= ; dotrain= ;;
    n) dotrain= ;;
    q) quiet=y ;;
    v) verbose=y ;;
    h) help; exit 0 ;;
    # getopts has already reported the offending option on stderr.
    *) usage >&2; exit 2 ;;
  esac
done

# Check for required options
if [ -z "${hamdir}" ] || [ ! -d "${hamdir}" ]; then
  usage_error "Missing or bad -H option"
fi
if [ -z "${spamdir}" ] || [ ! -d "${spamdir}" ]; then
  usage_error "Missing or bad -S option"
fi
if [ -z "${statsdir}" ]; then
  usage_error "Bad statsdir option"
fi

# make the stats dir if its missing, but only if its the default stats dir
# and not user specified
if [ "${statsdir}" = "${origstatsdir}" ] && [ ! -d "${statsdir}" ]; then
  mkdir -- "${statsdir}" || die "Missing statsdir (-s option)"
  madestatsdir=y
fi
[ -d "${statsdir}" ] || die "Missing statsdir (-s option)"

# check for bogofilter (first word of -b is the program, the rest are options)
bfbin=$(command -v "${bf%% *}") \
  || die "Missing bogofilter, not in path? (${bf})"
[ -x "${bfbin}" ] || die "Missing or bad bogofilter binary! (${bf})"

list="${statsdir}/trainbogo.filenames.txt"
log="${statsdir}/trainbogo.log"

# Init log files.  With -m (neither test nor train) existing logs are kept so
# the cached stats can be shown.
if [ ! -f "${log}.0.success" ] || [ -n "${dotest}" ] || [ -n "${dotrain}" ]; then
  verbose "init log files"
  for status in 0 1; do
    for suffix in success fail train.success train.fail; do
      : > "${log}.${status}.${suffix}" \
        || die "Cannot write ${log}.${status}.${suffix}"
    done
  done
fi

# First make a randomly sorted list of all the ham and spam files (if needed)
if [ ! -f "${list}" ] || [ -n "${dofilelist}" ]; then
  command -v md5sum > /dev/null 2>&1 || die "Missing md5sum, not in path?"

  # MD5 all the spam and ham
  [ -n "${rndseed}" ] || rndseed="$$.$(date +%s)"
  normal "MD5'ing ham and spam corpus, rndseed used: ${rndseed}"

  # The expected bogofilter exit code (1 = ham, 0 = spam) is attached while
  # hashing, so directory names never have to be used as a regex.
  {
    hash_corpus 1 "${hamdir}" && hash_corpus 0 "${spamdir}"
  } > "${list}" || die "Failed to build ${list}"

  [ -s "${list}" ] || die "No files to work on!!!"

  # This randomizes the file names by sorting on the md5 hash, then drops
  # the hash, leaving "<status> <path>".
  normal "Randomizing ham and spam"
  LC_ALL=C sort < "${list}" | cut -d' ' -f2- > "${list}.tmp" \
    || die "Failed to sort ${list}"
  mv -f -- "${list}.tmp" "${list}" || die "Failed to update ${list}"
fi

# Read each filename from the filelist and test and train bogofilter.
if [ -n "${dotest}" ] || [ -n "${dotrain}" ]; then
  normal "Training bogofilter"

  # read -r keeps backslashes; fname receives the rest of the line, so paths
  # with embedded spaces survive.
  while read -r spamstatus fname; do
    [ -n "${fname}" ] || continue

    # A failed redirect would yield status 1, which is also the "ham" code;
    # skip such entries instead of logging a bogus result.
    if [ ! -r "${fname}" ]; then
      printf '%s\n' "Skipping unreadable message: ${fname}" >&2
      continue
    fi

    classify "${fname}"
    ret=$?

    if [ "${spamstatus}" -eq "${ret}" ]; then
      # bogofilter detected this message correctly
      printf '%s\n' "${fname}" >> "${log}.${spamstatus}.success"
      mark="."
    else
      # Bogofilter failed to detect the msg correctly
      printf '%s\n' "${fname}" >> "${log}.${spamstatus}.fail"
      mark="-"

      if [ -n "${dotrain}" ]; then
        # Set the bogofilter option for training
        if [ "${spamstatus}" -eq 0 ]; then
          bfopt="-s"
        else
          bfopt="-n"
        fi

        # Train bogofilter (${bf} is split on purpose, see classify)
        # shellcheck disable=SC2086
        ${bf} ${bfopt} < "${fname}"

        # Test again
        classify "${fname}"
        ret=$?

        # Did it train successfully?
        if [ "${spamstatus}" -eq "${ret}" ]; then
          testresult="success"
          mark="+"
        else
          testresult="fail"
          mark="f"
        fi

        # Log train result
        printf '%s\n' "${fname}" >> "${log}.${spamstatus}.train.${testresult}"
      fi
    fi

    # One mark per message, printed once its final outcome is known.
    normaln "${mark}"
  done < "${list}"
fi

if [ -z "${quiet}" ]; then
  # Terminate the line of progress marks and leave a blank line.
  echo
  echo

  # Totals come from the index, i.e. the messages that were (or would be)
  # processed, not from a fresh directory listing.
  total_msg=$(count_lines "${list}")

  total_ham_msg=$(count_status 1)
  total_ham_fail=$(count_lines "${log}.1.fail")
  total_ham_train_fail=$(count_lines "${log}.1.train.fail")

  total_spam_msg=$(count_status 0)
  total_spam_fail=$(count_lines "${log}.0.fail")
  total_spam_train_fail=$(count_lines "${log}.0.train.fail")

  echo "Total messages: ${total_msg}"
  echo
  echo "Total ham: ${total_ham_msg}"
  echo "Misdetected ham: ${total_ham_fail}"
  if [ -n "${dotrain}" ]; then
    echo " retrain fail: ${total_ham_train_fail}"
  fi
  echo
  echo "Total spam: ${total_spam_msg}"
  echo "Misdetected spam: ${total_spam_fail}"
  if [ -n "${dotrain}" ]; then
    echo " retrain fail: ${total_spam_train_fail}"
  fi
  echo
fi

normal "Done"

cleanup

exit 0
# done
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -