📄 readonly_ws.cpp
字号:
// This file is part of The New Aspell// Copyright (C) 2000-2001 by Kevin Atkinson under the GNU LGPL// license version 2.0 or 2.1. You should have received a copy of the// LGPL license along with this library if you did not you can find it// at http://www.gnu.org/.// Aspell's main word list data is stored in 4 large blocks of memory//// * The Word Hash Table// * The Word List// * The Soundslike Hash Table// * The Soundslike List//// 1a) The Word Hash Table// This consists of an open address hash table which contains pointers// to the actual words in the word list//// 1b) The Word List// This consists of the actual word list and is layed out as follows:// <Word1><null char><Word2><null char>...//// 2a) The Soundslike Hash Table// This consists of an open address hash table which contains pointers// to a soundslike object.//// 2b) The Soundslike Object// The soundslike object is layed out as follow:// What: <Word1 pointer><Word2 p.>...<Num of Words><Soundslike><null char>// Types: <const char *><const char *>...<unsigned short int><char[]><char>// <unsigned int><unsigned int>...<unsigned short int><char[]><char>// The pointer to the object points to the beginning of the Soundslike string// The Word pointers consists of the the words which have the same // soundslike pattern//// 2c) The Soundslike List// This consists of Soundslike Objects back to back:// <Soundslike object 1><Soundslike object 2> ...// There is no delimiter between the objects////// Format of the *.wrd files//// (This part is in ascii format)// <"master_wl"><ws><lang name><ws><# words><ws>// <hash size><ws><size of list block><\n>// (The rest is in binary format>// <Wordlist>// <Word Hash Table>//// The word hash table is a vector of unsigned its which contains an offset// of where they can be found in the word list.//// Format of the *.sl files//// (This part is in ascii format)// <"master_wl"><ws><lang name><ws><# words><ws>// <hash size><ws><size of list block><\n>// (The rest is in binary format>// <Soundslike object list>// <Soundslike Hash Table>//// Soundslike oject is laid out as follows:// <Num of Words><Word 1 offset>...<Soundslike><\0>// <unsigned short int><unsigned int>...<char[]><char>// And like the .wrd file the hash table contains offsets not pointers.//#include <vector>using std::vector;using std::pair;#include <string.h>#include <stdio.h>//#include <errno.h>#include "settings.h"#include "fstream.hpp"#include "vector_hash-t.hpp"#include "block_vector.hpp"#include "data.hpp"#include "file_util.hpp"#include "data_util.hpp"#include "language.hpp"#include "config.hpp"#include "string_buffer.hpp"#include "errors.hpp"typedef unsigned int u32int;static const u32int u32int_max = (u32int)-1;typedef unsigned short u16int;#ifdef HAVE_MMAP // POSIX headers#include <fcntl.h>#include <unistd.h>#include <sys/mman.h>#endif#ifndef MAP_FAILED #define MAP_FAILED (-1)#endif#ifdef HAVE_MMAPstatic inline char * mmap_open(unsigned int block_size, FStream & f, unsigned int offset) { f.flush(); int fd = f.file_no(); return static_cast<char *> (mmap(NULL, block_size, PROT_READ, MAP_SHARED, fd, offset));}static inline void mmap_free(char * block, unsigned int size) { munmap(block, size);}static inline size_t page_size() {#ifdef _SC_PAGESIZE /* BSDi does not expose this limit via the sysconf function */ return sysconf (_SC_PAGESIZE);#else return getpagesize ();#endif}#elsestatic inline char * mmap_open(unsigned int, FStream & f, unsigned int) { return reinterpret_cast<char *>(MAP_FAILED);}static inline void mmap_free(char *, unsigned int) { abort();}static inline size_t page_size() { return 1024;}#endifnamespace aspeller_default_readonly_ws { using namespace aspeller; ///////////////////////////////////////////////////////////////////// // // ReadOnlyWS // class ReadOnlyWS : public BasicWordSet { public: //but don't use struct WordLookupParms { const char * block_begin; WordLookupParms() {} //WordLookupParms(const char * b, const Language * l) // : block_begin(b), hash(l), equal(l) {} typedef BlockVector<const u32int> Vector; typedef u32int Value; typedef const char * Key; static const bool is_multi = true; Key key(Value v) const {assert (v != u32int_max); return block_begin + v;} InsensitiveHash hash; InsensitiveEqual equal; bool is_nonexistent(Value v) const {return v == u32int_max;} void make_nonexistent(const Value & v) const {abort();} }; typedef VectorHashTable<WordLookupParms> WordLookup; struct SoundslikeLookupParms { const char * block_begin; SoundslikeLookupParms() {} SoundslikeLookupParms(const char * b) : block_begin(b) {} typedef BlockVector<const u32int> Vector; typedef u32int Value; typedef const char * Key; static const bool is_multi = false; Key key(Value v) const {return block_begin + v;} hash<const char *> hash; bool equal(Key rhs, Key lhs) const {return strcmp(rhs,lhs) == 0;} bool is_nonexistent(Value v) const {return v == u32int_max;} void make_nonexistent(const Value & v) const {abort();} }; typedef VectorHashTable<SoundslikeLookupParms> SoundslikeLookup; private: char * block; u32int block_size; bool block_mmaped; WordLookup word_lookup; const char * word_block; u32int max_word_length; bool use_soundslike; SoundslikeLookup soundslike_lookup; const char * soundslike_block; ReadOnlyWS(const ReadOnlyWS&); ReadOnlyWS& operator= (const ReadOnlyWS&); struct ElementsParms; struct SoundslikeElementsParms; struct SoundslikeWordsParms; struct SoundslikeElementsParmsNoSL; struct SoundslikeWordsParmsNoSL; struct SoundslikeWordsEmulSingle; public: VirEmul * detailed_elements() const; Size size() const; bool empty() const; ReadOnlyWS() { block = 0; } ~ReadOnlyWS() { if (block != 0) { if (block_mmaped) mmap_free(block, block_size); else delete[] block; } } PosibErr<void> load(ParmString, Config *, SpellerImpl *, const LocalWordSetInfo *); BasicWordInfo lookup (ParmString word, const SensitiveCompare &) const; VirEmul * words_w_soundslike(const char * soundslike) const; VirEmul * words_w_soundslike(SoundslikeWord soundslike) const; VirSoundslikeEmul * soundslike_elements() const; }; // // // struct ReadOnlyWS::ElementsParms { typedef BasicWordInfo Value; typedef WordLookup::const_iterator Iterator; const char * word_block_begin; ElementsParms(const char * b) : word_block_begin(b) {} bool endf(const Iterator & i) const {return i.at_end();} Value end_state() const {return 0;} Value deref(const Iterator & i) const { return Value(word_block_begin + *i, *(word_block_begin + *i - 1)); } }; ReadOnlyWS::VirEmul * ReadOnlyWS::detailed_elements() const { return new MakeVirEnumeration<ElementsParms> (word_lookup.begin(), ElementsParms(block)); } ReadOnlyWS::Size ReadOnlyWS::size() const { return word_lookup.size(); } bool ReadOnlyWS::empty() const { return word_lookup.empty(); } struct DataHead { // all sizes except the last four must to divisible by // page_size() char check_word[64]; u32int head_size; u32int total_block_size; u32int word_block_size; u32int word_count; u32int word_buckets; u32int word_size; u32int max_word_length; u32int soundslike_block_size; u32int soundslike_count; u32int soundslike_buckets; u32int soundslike_size; u32int lang_name_size; u32int soundslike_name_size; u32int soundslike_version_size; u32int minimal_specified; u32int middle_chars_size; }; PosibErr<void> ReadOnlyWS::load(ParmString f0, Config * config, SpellerImpl *, const LocalWordSetInfo *) { set_file_name(f0); const char * fn = file_name(); FStream f; RET_ON_ERR(f.open(fn, "rb")); DataHead data_head; f.read(&data_head, sizeof(DataHead)); if (strcmp(data_head.check_word, "aspell default speller rowl 1.4") != 0) return make_err(bad_file_format, fn); char * word = new char[data_head.lang_name_size]; f.read(word, data_head.lang_name_size); PosibErr<void> pe = set_check_lang(word,config); if (pe.has_err()) return pe.with_file(fn); delete[] word; word = new char[data_head.soundslike_name_size]; f.read(word, data_head.soundslike_name_size); if (strcmp(word, lang()->soundslike_name()) != 0) return make_err(bad_file_format, fn, "Wrong Soundslike"); if (strcmp(word, "none") == 0) use_soundslike=false; else use_soundslike=true; delete[] word; word = new char[data_head.soundslike_version_size]; f.read(word, data_head.soundslike_version_size); if (strcmp(word, lang()->soundslike_version()) != 0) return make_err(bad_file_format, fn, "Wrong Soundslike Version"); delete[] word; if (data_head.minimal_specified != u32int_max) { word = new char[data_head.middle_chars_size]; f.read(word, data_head.middle_chars_size); if (strcmp(word, lang()->mid_chars()) != 0) return make_err(bad_file_format, fn, "Different Middle Characters"); delete[] word; if (data_head.minimal_specified != u32int_max) { config->replace("run-together-specified", "true"); unsigned int m = config->retrieve_int("minimal-specified-component"); if (data_head.minimal_specified < m) { char buf[20]; sprintf(buf, "%i", data_head.minimal_specified); config->replace("minimal-specified-component", buf); } } } block_size = data_head.total_block_size; block = mmap_open(block_size, f, data_head.head_size); block_mmaped = block != (char *)MAP_FAILED; if (!block_mmaped) { block = new char[block_size]; f.seek(data_head.head_size, SEEK_SET); f.read(block, block_size); } word_block = block; word_lookup.parms().block_begin = word_block; word_lookup.parms().hash .lang = lang(); word_lookup.parms().equal.lang = lang(); const u32int * begin = reinterpret_cast<const u32int *> (word_block + data_head.word_block_size); word_lookup.vector().set(begin, begin + data_head.word_buckets); word_lookup.set_size(data_head.word_count); max_word_length = data_head.max_word_length; if (use_soundslike) { soundslike_block = block + data_head.word_block_size + data_head.word_size; soundslike_lookup.parms().block_begin = soundslike_block; begin = reinterpret_cast<const u32int *> (soundslike_block + data_head.soundslike_block_size); soundslike_lookup.vector().set(begin, begin + data_head.soundslike_buckets); soundslike_lookup.set_size(data_head.soundslike_count); } return no_err; } BasicWordInfo ReadOnlyWS::lookup(ParmString word, const SensitiveCompare & c) const { WordLookup::ConstFindIterator i = word_lookup.multi_find(word); for (; !i.at_end(); i.adv()) { const char * w = word_block + i.deref(); if (c(word, w)) return BasicWordInfo(w,*(w-1)); } return 0; } struct ReadOnlyWS::SoundslikeWordsParms { typedef BasicWordInfo Value; typedef const u32int * Iterator; const char * word_block_begin; Iterator end; SoundslikeWordsParms(const char * b, Iterator e) : word_block_begin(b), end(e) {} bool endf(Iterator i) const {return i == end;} Value end_state() const {return 0;} Value deref(Iterator i) const { return Value(word_block_begin + *i, *(word_block_begin + *i - 1)); } }; struct ReadOnlyWS::SoundslikeElementsParms { typedef SoundslikeWord Value; typedef SoundslikeLookup::const_iterator Iterator; const char * soundslike_block_begin; SoundslikeElementsParms(const char * b) : soundslike_block_begin(b) {} bool endf(Iterator i) const {return i.at_end();} Value deref(Iterator i) { return Value(soundslike_block_begin + *i, 0); } Value end_state() {return Value(0,0);} }; struct ReadOnlyWS::SoundslikeElementsParmsNoSL { typedef SoundslikeWord Value; typedef WordLookup::const_iterator Iterator; const char * word_block_begin; const Language * lang; vector<char> buf; SoundslikeElementsParmsNoSL(u32int max_len, const char * b, const Language * l) : word_block_begin(b), lang(l) { buf.resize(max_len + 1); } bool endf(const Iterator & i) const {return i.at_end();} Value end_state() const {return Value(0,0);} Value deref(const Iterator & i) { Value v(&*buf.begin(), word_block_begin + *i); const char * w = static_cast<const char *>(v.word_list_pointer); int j = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -