readonly_ws.cpp

来自「unix/linux下拼写检查程序源码」· C++ 代码 · 共 929 行 · 第 1/2 页
CPP
929 行
// This file is part of The New Aspell// Copyright (C) 2000-2001 by Kevin Atkinson under the GNU LGPL// license version 2.0 or 2.1.  You should have received a copy of the// LGPL license along with this library if you did not you can find it// at http://www.gnu.org/.// Aspell's main word list data is stored in 4 large blocks of memory//// * The Word Hash Table// * The Word List// * The Soundslike Hash Table// * The Soundslike List//// 1a) The Word Hash Table// This consists of an open address hash table which contains pointers// to the actual words in the word list//// 1b) The Word List// This consists of the actual word list and is layed out as follows://   <Word1><null char><Word2><null char>...//// 2a) The Soundslike Hash Table// This consists of an open address hash table which contains pointers// to a soundslike object.//// 2b) The Soundslike Object// The soundslike object is layed out as follow://  What:  <Word1 pointer><Word2 p.>...<Num of Words><Soundslike><null char>//  Types: <const char *><const char *>...<unsigned short int><char[]><char>//         <unsigned int><unsigned int>...<unsigned short int><char[]><char>// The pointer to the object points to the beginning of the Soundslike string// The Word pointers consists of the the words which have the same //   soundslike pattern//// 2c) The Soundslike List// This consists of Soundslike Objects back to back://  <Soundslike object 1><Soundslike object 2> ...// There is no delimiter between the objects//////                          Format of the *.wrd files//// (This part is in ascii format)// <"master_wl"><ws><lang name><ws><# words><ws>//     <hash size><ws><size of list block><\n>// (The rest is in binary format>// <Wordlist>// <Word Hash Table>//// The word hash table is a vector of unsigned its which contains an offset// of where they can be found in the word list.////                          Format of the *.sl files//// (This part is in ascii format)// <"master_wl"><ws><lang name><ws><# words><ws>//     <hash size><ws><size of list block><\n>// (The rest is in binary format>// <Soundslike object list>// <Soundslike Hash Table>//// Soundslike oject is laid out as follows://   <Num of Words><Word 1 offset>...<Soundslike><\0>//   <unsigned short int><unsigned int>...<char[]><char>// And like the .wrd file the hash table contains offsets not pointers.//#include <vector>using std::vector;using std::pair;#include <string.h>#include <stdio.h>//#include <errno.h>#include "settings.h"#include "fstream.hpp"#include "vector_hash-t.hpp"#include "block_vector.hpp"#include "data.hpp"#include "file_util.hpp"#include "data_util.hpp"#include "language.hpp"#include "config.hpp"#include "string_buffer.hpp"#include "errors.hpp"typedef unsigned int   u32int;static const u32int u32int_max = (u32int)-1;typedef unsigned short u16int;#ifdef HAVE_MMAP // POSIX headers#include <fcntl.h>#include <unistd.h>#include <sys/mman.h>#endif#ifndef MAP_FAILED #define MAP_FAILED (-1)#endif#ifdef HAVE_MMAPstatic inline char * mmap_open(unsigned int block_size, 			       FStream & f, 			       unsigned int offset) {  f.flush();  int fd = f.file_no();  return static_cast<char *>    (mmap(NULL, block_size, PROT_READ, MAP_SHARED, fd, offset));}static inline void mmap_free(char * block, unsigned int size) {  munmap(block, size);}static inline size_t page_size() {#ifdef _SC_PAGESIZE /* BSDi does not expose this limit via the sysconf function */  return sysconf (_SC_PAGESIZE);#else  return getpagesize ();#endif}#elsestatic inline char * mmap_open(unsigned int, 			       FStream & f, 			       unsigned int) {  return reinterpret_cast<char *>(MAP_FAILED);}static inline void mmap_free(char *, unsigned int) {  abort();}static inline size_t page_size() {  return 1024;}#endifnamespace aspeller_default_readonly_ws {  using namespace aspeller;  /////////////////////////////////////////////////////////////////////  //   //  ReadOnlyWS  //      class ReadOnlyWS : public BasicWordSet  {        public: //but don't use    struct WordLookupParms {      const char * block_begin;      WordLookupParms() {}      //WordLookupParms(const char * b, const Language * l)      //	: block_begin(b), hash(l), equal(l) {}      typedef BlockVector<const u32int> Vector;      typedef u32int                    Value;      typedef const char *              Key;      static const bool is_multi = true;      Key key(Value v) const {assert (v != u32int_max);				return block_begin + v;}      InsensitiveHash  hash;      InsensitiveEqual equal;      bool is_nonexistent(Value v) const {return v == u32int_max;}      void make_nonexistent(const Value & v) const {abort();}    };    typedef VectorHashTable<WordLookupParms> WordLookup;        struct SoundslikeLookupParms {      const char * block_begin;      SoundslikeLookupParms() {}      SoundslikeLookupParms(const char * b) : block_begin(b) {}      typedef BlockVector<const u32int> Vector;      typedef u32int                    Value;      typedef const char *              Key;      static const bool is_multi = false;      Key key(Value v) const {return block_begin + v;}      hash<const char *> hash;      bool equal(Key rhs, Key lhs) const {return strcmp(rhs,lhs) == 0;}      bool is_nonexistent(Value v) const {return v == u32int_max;}      void make_nonexistent(const Value & v) const {abort();}    };    typedef VectorHashTable<SoundslikeLookupParms> SoundslikeLookup;  private:          char *           block;    u32int           block_size;    bool             block_mmaped;    WordLookup       word_lookup;    const char *     word_block;    u32int           max_word_length;    bool             use_soundslike;    SoundslikeLookup soundslike_lookup;    const char *     soundslike_block;        ReadOnlyWS(const ReadOnlyWS&);    ReadOnlyWS& operator= (const ReadOnlyWS&);    struct ElementsParms;    struct SoundslikeElementsParms;    struct SoundslikeWordsParms;    struct SoundslikeElementsParmsNoSL;    struct SoundslikeWordsParmsNoSL;    struct SoundslikeWordsEmulSingle;  public:    VirEmul * detailed_elements() const;    Size      size()     const;    bool      empty()    const;          ReadOnlyWS() {      block = 0;    }    ~ReadOnlyWS() {      if (block != 0) {	if (block_mmaped)	  mmap_free(block, block_size);	else	  delete[] block;      }    }          PosibErr<void> load(ParmString, Config *, SpellerImpl *, const LocalWordSetInfo *);    BasicWordInfo lookup (ParmString word, const SensitiveCompare &) const;    VirEmul * words_w_soundslike(const char * soundslike) const;    VirEmul * words_w_soundslike(SoundslikeWord soundslike) const;    VirSoundslikeEmul * soundslike_elements() const;  };      //  //    //  struct ReadOnlyWS::ElementsParms {    typedef BasicWordInfo                   Value;    typedef WordLookup::const_iterator Iterator;     const char * word_block_begin;    ElementsParms(const char * b) : word_block_begin(b) {}    bool endf(const Iterator & i) const {return i.at_end();}    Value end_state() const {return 0;}    Value deref(const Iterator & i) const {      return Value(word_block_begin + *i, *(word_block_begin + *i - 1));    }  };  ReadOnlyWS::VirEmul * ReadOnlyWS::detailed_elements() const {    return new MakeVirEnumeration<ElementsParms>      (word_lookup.begin(), ElementsParms(block));  }  ReadOnlyWS::Size ReadOnlyWS::size() const {    return word_lookup.size();  }    bool ReadOnlyWS::empty() const {    return word_lookup.empty();  }  struct DataHead {    // all sizes except the last four must to divisible by    // page_size()    char check_word[64];    u32int head_size;    u32int total_block_size;    u32int word_block_size;    u32int word_count;    u32int word_buckets;    u32int word_size;    u32int max_word_length;    u32int soundslike_block_size;    u32int soundslike_count;    u32int soundslike_buckets;    u32int soundslike_size;    u32int lang_name_size;    u32int soundslike_name_size;    u32int soundslike_version_size;    u32int minimal_specified;    u32int middle_chars_size;  };  PosibErr<void> ReadOnlyWS::load(ParmString f0, Config * config, 				  SpellerImpl *, const LocalWordSetInfo *)  {    set_file_name(f0);    const char * fn = file_name();    FStream f;    RET_ON_ERR(f.open(fn, "rb"));    DataHead data_head;    f.read(&data_head, sizeof(DataHead));    if (strcmp(data_head.check_word, "aspell default speller rowl 1.4") != 0)      return make_err(bad_file_format, fn);    char * word = new char[data_head.lang_name_size];    f.read(word, data_head.lang_name_size);    PosibErr<void> pe = set_check_lang(word,config);    if (pe.has_err())      return pe.with_file(fn);        delete[] word;    word = new char[data_head.soundslike_name_size];    f.read(word, data_head.soundslike_name_size);    if (strcmp(word, lang()->soundslike_name()) != 0)      return make_err(bad_file_format, fn, "Wrong Soundslike");    if (strcmp(word, "none") == 0)      use_soundslike=false;    else      use_soundslike=true;    delete[] word;    word = new char[data_head.soundslike_version_size];    f.read(word, data_head.soundslike_version_size);    if (strcmp(word, lang()->soundslike_version()) != 0)      return make_err(bad_file_format, fn, "Wrong Soundslike Version");    delete[] word;    if (data_head.minimal_specified != u32int_max) {      word = new char[data_head.middle_chars_size];      f.read(word, data_head.middle_chars_size);            if (strcmp(word, lang()->mid_chars()) != 0)	return make_err(bad_file_format, fn, "Different Middle Characters");            delete[] word;      if (data_head.minimal_specified != u32int_max) {	config->replace("run-together-specified", "true");	unsigned int m = config->retrieve_int("minimal-specified-component");	if (data_head.minimal_specified < m) {	  char buf[20];	  sprintf(buf, "%i", data_head.minimal_specified);	  config->replace("minimal-specified-component", buf);	}      }    }    block_size = data_head.total_block_size;    block = mmap_open(block_size, f, data_head.head_size);    block_mmaped = block != (char *)MAP_FAILED;    if (!block_mmaped) {      block = new char[block_size];      f.seek(data_head.head_size, SEEK_SET);      f.read(block, block_size);    }    word_block       = block;    word_lookup.parms().block_begin = word_block;    word_lookup.parms().hash .lang   = lang();    word_lookup.parms().equal.lang   = lang();    const u32int * begin = reinterpret_cast<const u32int *>      (word_block + data_head.word_block_size);    word_lookup.vector().set(begin, begin + data_head.word_buckets);    word_lookup.set_size(data_head.word_count);        max_word_length = data_head.max_word_length;        if (use_soundslike) {      soundslike_block = block + data_head.word_block_size + data_head.word_size;      soundslike_lookup.parms().block_begin = soundslike_block;      begin = reinterpret_cast<const u32int *>	(soundslike_block + data_head.soundslike_block_size);      soundslike_lookup.vector().set(begin,				     begin + data_head.soundslike_buckets);      soundslike_lookup.set_size(data_head.soundslike_count);    }    return no_err;  }  BasicWordInfo ReadOnlyWS::lookup(ParmString word, 				   const SensitiveCompare & c) const   {    WordLookup::ConstFindIterator i = word_lookup.multi_find(word);    for (; !i.at_end(); i.adv()) {      const char * w = word_block + i.deref();      if (c(word, w))	return BasicWordInfo(w,*(w-1));    }    return 0;  }  struct ReadOnlyWS::SoundslikeWordsParms {    typedef BasicWordInfo                   Value;    typedef const u32int *             Iterator;    const char * word_block_begin;    Iterator     end;    SoundslikeWordsParms(const char * b, Iterator e)       : word_block_begin(b), end(e) {}    bool endf(Iterator i) const {return i == end;}    Value end_state() const {return 0;}    Value deref(Iterator i) const {      return Value(word_block_begin + *i, *(word_block_begin + *i - 1));    }  };  struct ReadOnlyWS::SoundslikeElementsParms {    typedef SoundslikeWord                   Value;    typedef SoundslikeLookup::const_iterator Iterator;    const char * soundslike_block_begin;          SoundslikeElementsParms(const char * b)       : soundslike_block_begin(b) {}          bool endf(Iterator i) const {return i.at_end();}        Value deref(Iterator i) {      return Value(soundslike_block_begin + *i, 0);    }    Value end_state() {return Value(0,0);}  };  struct ReadOnlyWS::SoundslikeElementsParmsNoSL {    typedef SoundslikeWord              Value;    typedef WordLookup::const_iterator  Iterator;     const char * word_block_begin;    const Language * lang;    vector<char> buf;    SoundslikeElementsParmsNoSL(u32int max_len, const char * b, 				const Language * l)       : word_block_begin(b), lang(l)    {      buf.resize(max_len + 1);    }    bool endf(const Iterator & i) const {return i.at_end();}    Value end_state() const {return Value(0,0);}    Value deref(const Iterator & i)     {      Value v(&*buf.begin(), word_block_begin + *i);      const char * w = static_cast<const char *>(v.word_list_pointer);      int j = 0;
readonly_ws.cpp - 源码说明

本页面展示了「unix/linux下拼写检查程序源码」中的 readonly_ws.cpp 源码文件，采用 C++ 编程语言编写，共 929 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与linux相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?