📄 options.py
"""OptionsAbstract:Options.options is a globally shared options object.This object is initialised when the module is loaded: the envarBAYESCUSTOMIZE is checked for a list of names, if nothing is foundthen the local directory and the home directory are checked for afile called bayescustomize.ini or .spambayesrc (respectively) andthe initial values are loaded from this.The Option class is defined in OptionsClass.py - this moduleis responsible only for instantiating and loading the globallyshared instance.To Do: o Suggestions?"""import sys, ostry: True, Falseexcept NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0try: _except NameError: _ = lambda arg: arg__all__ = ['options', '_']# Grab the stuff from the core options class.from OptionsClass import *# A little magic. We'd like to use ZODB as the default storage,# because we've had so many problems with bsddb, and we'd like to swap# to new ZODB problems <wink>. However, apart from this, we only need# a standard Python install - if the default was ZODB then we would# need ZODB to be installed as well (which it will br for binary users,# but might not be for source users). So what we do is check whether# ZODB is importable and if it is, default to that, and if not, default# to dbm. If ZODB is sometimes importable and sometimes not (e.g. you# muck around with the PYTHONPATH), then this may not work well - the# best idea would be to explicitly put the type in your configuration# file.try: import ZODBexcept ImportError: DB_TYPE = "dbm", "hammie.db", "spambayes.messageinfo.db"else: del ZODB DB_TYPE = "zodb", "hammie.fs", "messageinfo.fs"# Format:# defaults is a dictionary, where the keys are the section names# each key maps to a tuple consisting of:# option name, display name, default,# doc string, possible values, restore on restore-to-defaults# The display name and doc string should be enclosed in _() to allow# i18n. In a few cases, then possible values should also be enclosed# in _().defaults = { "Tokenizer" : ( ("basic_header_tokenize", _("Basic header tokenising"), False, _("""If true, tokenizer.Tokenizer.tokenize_headers() will tokenize the contents of each header field just like the text of the message body, using the name of the header as a tag. Tokens look like "header:word". The basic approach is simple and effective, but also very sensitive to biases in the ham and spam collections. For example, if the ham and spam were collected at different times, several headers with date/time information will become the best discriminators. (Not just Date, but Received and X-From_.)"""), BOOLEAN, RESTORE), ("basic_header_tokenize_only", _("Only basic header tokenising"), False, _("""If true and basic_header_tokenize is also true, then basic_header_tokenize is the only action performed."""), BOOLEAN, RESTORE), ("basic_header_skip", _("Basic headers to skip"), ("received date x-.*",), _("""If basic_header_tokenize is true, then basic_header_skip is a set of headers that should be skipped."""), HEADER_NAME, RESTORE), ("check_octets", _("Check application/octet-stream sections"), False, _("""If true, the first few characters of application/octet-stream sections are used, undecoded. 
# Format:
# defaults is a dictionary, where the keys are the section names.
# Each key maps to a tuple consisting of:
#   option name, display name, default,
#   doc string, possible values, restore on restore-to-defaults
# The display name and doc string should be enclosed in _() to allow
# i18n.  In a few cases, the possible values should also be enclosed
# in _().

defaults = {
 "Tokenizer" : (
    ("basic_header_tokenize", _("Basic header tokenising"), False,
     _("""If true, tokenizer.Tokenizer.tokenize_headers() will tokenize the
     contents of each header field just like the text of the message
     body, using the name of the header as a tag.  Tokens look like
     "header:word".  The basic approach is simple and effective, but
     also very sensitive to biases in the ham and spam collections.
     For example, if the ham and spam were collected at different
     times, several headers with date/time information will become the
     best discriminators.  (Not just Date, but Received and
     X-From_.)"""),
     BOOLEAN, RESTORE),

    ("basic_header_tokenize_only", _("Only basic header tokenising"), False,
     _("""If true and basic_header_tokenize is also true, then
     basic_header_tokenize is the only action performed."""),
     BOOLEAN, RESTORE),

    ("basic_header_skip", _("Basic headers to skip"), ("received date x-.*",),
     _("""If basic_header_tokenize is true, then basic_header_skip is a
     set of headers that should be skipped."""),
     HEADER_NAME, RESTORE),

    ("check_octets", _("Check application/octet-stream sections"), False,
     _("""If true, the first few characters of application/octet-stream
     sections are used, undecoded.  What 'few' means is decided by
     octet_prefix_size."""),
     BOOLEAN, RESTORE),

    ("octet_prefix_size", _("Number of characters of octet stream to process"), 5,
     _("""The number of characters of the application/octet-stream
     sections to use, if check_octets is set to true."""),
     INTEGER, RESTORE),

    ("x-short_runs", _("Count runs of short 'words'"), False,
     _("""(EXPERIMENTAL) If true, generate tokens based on max number of
     short word runs.  Short words are anything of length < the
     skip_max_word_size option.  Normally they are skipped, but one
     common spam technique spells words like 'V I A G R A'."""),
     BOOLEAN, RESTORE),

    ("x-lookup_ip", _("Generate IP address tokens from hostnames"), False,
     _("""(EXPERIMENTAL) Generate IP address tokens from hostnames.
     Requires PyDNS (http://pydns.sourceforge.net/)."""),
     BOOLEAN, RESTORE),

    ("lookup_ip_cache", _("x-lookup_ip cache file location"), "",
     _("""Tell SpamBayes where to cache IP address lookup information.
     Only comes into play if lookup_ip is enabled.  The default (empty
     string) disables the file cache.  When caching is enabled, the
     cache file is stored using the same database type as the main
     token store (only dbm and zodb supported so far; zodb has
     problems, dbm is untested, hence the default)."""),
     PATH, RESTORE),

    ("x-image_size", _("Generate image size tokens"), False,
     _("""(EXPERIMENTAL) If true, generate tokens based on the sizes of
     embedded images."""),
     BOOLEAN, RESTORE),

    ("x-crack_images", _("Look inside images for text"), False,
     _("""(EXPERIMENTAL) If true, generate tokens based on the (hopefully)
     text content contained in any images in each message.  The current
     support is minimal and relies on the installation of ocrad
     (http://www.gnu.org/software/ocrad/ocrad.html) and netpbm.  It is
     almost certainly only useful in its current form on Unix-like
     machines."""),
     BOOLEAN, RESTORE),

    ("crack_image_cache", _("Cache to speed up OCR."), "",
     _("""If non-empty, names a file from which to read cached OCR info
     at start and to which to save that info at exit."""),
     PATH, RESTORE),

    ("ocrad_scale", _("Scale factor to use with ocrad."), 2,
     _("""Specifies the scale factor to apply when running ocrad.  While
     you can specify a negative scale it probably won't help.  Scaling
     up by a factor of 2 or 3 seems to work well for the sort of spam
     images encountered by SpamBayes."""),
     INTEGER, RESTORE),

    ("ocrad_charset", _("Charset to apply with ocrad."), "ascii",
     _("""Specifies the charset to use when running ocrad.  Valid values
     are 'ascii', 'iso-8859-9' and 'iso-8859-15'."""),
     OCRAD_CHARSET, RESTORE),

    ("max_image_size", _("Max image size to try OCR-ing"), 100000,
     _("""When crack_images is enabled, this specifies the largest image
     to try OCR on."""),
     INTEGER, RESTORE),

    ("count_all_header_lines", _("Count all header lines"), False,
     _("""Generate tokens just counting the number of instances of each
     kind of header line, in a case-sensitive way.  Depending on data
     collection, some headers are not safe to count.  For example, if
     ham is collected from a mailing list but spam from your regular
     inbox traffic, the presence of a header like List-Info will be a
     very strong ham clue, but a bogus one.  In that case, set
     count_all_header_lines to False, and adjust safe_headers
     instead."""),
     BOOLEAN, RESTORE),

    ("record_header_absence", _("Record header absence"), False,
     _("""When True, generate a "noheader:HEADERNAME" token for each
     header in safe_headers (below) that *doesn't* appear in the
     headers.  This helped in various of Tim's python.org tests, but
     appeared to hurt a little in Anthony Baxter's tests."""),
     BOOLEAN, RESTORE),
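    # For illustration (the exact casing of the generated token is an
    # assumption, not taken from this file): with record_header_absence
    # enabled, a message lacking an Organization: header would produce
    # a token along the lines of "noheader:organization".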
    ("safe_headers", _("Safe headers"),
     ("abuse-reports-to", "date", "errors-to", "from", "importance",
      "in-reply-to", "message-id", "mime-version", "organization",
      "received", "reply-to", "return-path", "subject", "to",
      "user-agent", "x-abuse-info", "x-complaints-to", "x-face"),
     _("""Like count_all_header_lines, but restricted to headers in this
     list.  safe_headers is ignored when count_all_header_lines is
     true, unless record_header_absence is also true."""),
     HEADER_NAME, RESTORE),

    ("mine_received_headers", _("Mine the received headers"), False,
     _("""A lot of clues can be gotten from IP addresses and names in
     Received: headers.  This can give spectacular results for bogus
     reasons if your corpora are from different sources."""),
     BOOLEAN, RESTORE),

    ("address_headers", _("Address headers to mine"),
     ("from", "to", "cc", "sender", "reply-to"),
     _("""Mine the following address headers.  If you have mixed source
     corpuses (as opposed to a mixed sauce walrus, which is delicious!)
     then you probably don't want to use 'to' or 'cc'.  Address headers
     will be decoded, and will generate charset tokens as well as the
     real address.  Others to consider: errors-to, ..."""),
     HEADER_NAME, RESTORE),

    ("generate_long_skips", _("Generate long skips"), True,
     _("""If legitimate mail contains things that look like text to the
     tokenizer (perhaps binary attachments get 'defanged' by something
     upstream from this operation and thus look like text), turning
     off this option may help, and should be an alert that perhaps the
     tokenizer is broken."""),
     BOOLEAN, RESTORE),

    ("summarize_email_prefixes", _("Summarise email prefixes"), False,
     _("""Try to capitalize on mail sent to multiple similar
     addresses."""),
     BOOLEAN, RESTORE),

    ("summarize_email_suffixes", _("Summarise email suffixes"), False,
     _("""Try to capitalize on mail sent to multiple similar
     addresses."""),
     BOOLEAN, RESTORE),

    ("skip_max_word_size", _("Long skip trigger length"), 12,
     _("""Length of words that triggers 'long skips'.  Longer than this
     triggers a skip."""),
     INTEGER, RESTORE),

    ("x-pick_apart_urls", _("Extract clues about URL structure"), False,
     _("""(EXPERIMENTAL) Note whether a URL contains a non-standard port
     or user/password elements."""),
     BOOLEAN, RESTORE),

    ("x-fancy_url_recognition", _("Extract URLs without http:// prefix"), False,
     _("""(EXPERIMENTAL) Recognize 'www.python.org' or 'ftp.python.org'
     as URLs instead of just long words."""),
     BOOLEAN, RESTORE),

    ("replace_nonascii_chars", _("Replace non-ascii characters"), False,
     _("""If true, replace high-bit characters (ord(c) >= 128) and
     control characters with question marks.  This allows non-ASCII
     character strings to be identified with little training and small
     database burden.  It's appropriate only if your ham is plain 7-bit
     ASCII, or nearly so, so that the mere presence of non-ASCII
     character strings is known in advance to be a strong spam
     indicator."""),
     BOOLEAN, RESTORE),
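    # For illustration (an example, not code from this file): with
    # replace_nonascii_chars enabled, each high-bit or control character
    # is mapped to a single '?', so "Fran\xe7ais" is tokenized as if it
    # were "Fran?ais".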
    ("x-search_for_habeas_headers", _("Search for Habeas headers"), False,
     _("""(EXPERIMENTAL) If true, search for the Habeas headers (see
     http://www.habeas.com).  If they are present and correct, this
     should be a strong ham sign; if they are present and incorrect,
     this should be a strong spam sign."""),
     BOOLEAN, RESTORE),

    ("x-reduce_habeas_headers", _("Reduce Habeas header tokens to one"), False,
     _("""(EXPERIMENTAL) If SpamBayes is set to search for the Habeas
     headers, nine tokens are generated for messages with Habeas
     headers.  This should be fine, since messages with the headers
     should either be ham, or result in false negatives that we can
     send to Habeas so the senders can be sued.  However, to reduce
     the strength of the Habeas headers, we offer the ability to
     reduce the nine tokens to one.  (This option has no effect if
     'Search for Habeas Headers' is False.)"""),
     BOOLEAN, RESTORE),
    ),

 # These options are all experimental; it seemed better to put them into
 # their own category than have several interdependent experimental
 # options.  If this capability is removed, the entire section can go.
 "URLRetriever" : (
    ("x-slurp_urls", _("Tokenize text content at the end of URLs"), False,
     _("""(EXPERIMENTAL) If this option is enabled, when a message
     normally scores in the 'unsure' range, and has fewer tokens than
     the maximum looked at, and contains URLs, then the text at those
     URLs is obtained and tokenized.  If those tokens result in the
     message moving to a score outside the 'unsure' range, then they
     are added to the tokens for the message.  This should be
     particularly effective for messages that contain only a single
     URL and no other text."""),
     BOOLEAN, RESTORE),