📄 darts.h
字号:
/* Darts -- Double-ARray Trie System $Id: darts.h 1528 2006-08-07 02:39:50Z taku $; Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org> All rights reserved. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or(at your option) any later verjsion. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*/#ifndef _DARTS_H_#define _DARTS_H_#define DARTS_VERSION "0.3"#include <vector>#include <cstring>#include <cstdio>#ifdef HAVE_ZLIB_Hnamespace zlib {#include <zlib.h>}#define SH(p)((unsigned short)(unsigned char)((p)[0]) |((unsigned short)(unsigned char)((p)[1]) << 8))#define LG(p)((unsigned long)(SH(p)) |((unsigned long)(SH((p)+2)) << 16))#endifnamespace Darts { template <class T> inline T _max(T x, T y) { return(x > y) ? x : y; } template <class T> inline T* _resize(T* ptr, size_t n, size_t l, T v) { T *tmp = new T [l]; for (size_t i = 0; i < n; ++i) tmp[i] = ptr[i]; for (size_t i = n; i < l; ++i) tmp[i] = v; delete [] ptr; return tmp; } template <class T> class Length { public: size_t operator()(const T *key) const { size_t i; for (i = 0; key[i] != (T)0; ++i) {}; return i; } }; template <> class Length<char> { public: size_t operator()(const char *key) const { return std::strlen(key); }; }; template <class node_type_, class node_u_type_, class array_type_, class array_u_type_, class length_func_ = Length<node_type_> > class DoubleArrayImpl { private: struct node_t { array_u_type_ code; size_t depth; size_t left; size_t right; }; struct unit_t { array_type_ base; array_u_type_ check; }; unit_t *array_; unsigned char *used_; size_t size_; size_t alloc_size_; node_type_ **key_; size_t key_size_; size_t *length_; array_type_ *value_; size_t progress_; size_t next_check_pos_; bool no_delete_; int error_; int(*progress_func_)(size_t, size_t); size_t resize(const size_t new_size) { unit_t tmp; tmp.base = 0; tmp.check = 0; array_ = _resize(array_, alloc_size_, new_size, tmp); used_ = _resize(used_, alloc_size_, new_size,(unsigned char)0); alloc_size_ = new_size; return new_size; } size_t fetch(const node_t &parent, std::vector <node_t> &siblings) { if (error_ < 0) return 0; array_u_type_ prev = 0; for (size_t i = parent.left; i < parent.right; ++i) { if ((length_ ? length_[i] : length_func_()(key_[i])) < parent.depth) continue; const node_u_type_ *tmp =(node_u_type_ *)(key_[i]); array_u_type_ cur = 0; if ((length_ ? length_[i] : length_func_()(key_[i])) != parent.depth) cur =(array_u_type_)tmp[parent.depth] + 1; if (prev > cur) { error_ = -3; return 0; } if (cur != prev || siblings.empty()) { node_t tmp_node; tmp_node.depth = parent.depth + 1; tmp_node.code = cur; tmp_node.left = i; if (! siblings.empty()) siblings[siblings.size()-1].right = i; siblings.push_back(tmp_node); } prev = cur; } if (! siblings.empty()) siblings[siblings.size()-1].right = parent.right; return siblings.size(); } size_t insert(const std::vector <node_t> &siblings) { if (error_ < 0) return 0; size_t begin = 0; size_t pos = _max((size_t)siblings[0].code + 1, next_check_pos_) - 1; size_t nonzero_num = 0; int first = 0; if (alloc_size_ <= pos) resize (pos + 1); while (true) { next: ++pos; if (alloc_size_ <= pos) resize (pos + 1); if (array_[pos].check) { ++nonzero_num; continue; } else if (! first) { next_check_pos_ = pos; first = 1; } begin = pos - siblings[0].code; if (alloc_size_ <= (begin + siblings[siblings.size()-1].code)) resize((size_t)(alloc_size_ * _max(1.05, 1.0 * key_size_ / progress_))); if (used_[begin]) continue; for (size_t i = 1; i < siblings.size(); ++i) if (array_[begin + siblings[i].code].check != 0) goto next; break; } // -- Simple heuristics -- // if the percentage of non-empty contents in check between the index // 'next_check_pos' and 'check' is greater than some constant value(e.g. 0.9), // new 'next_check_pos' index is written by 'check'. if (1.0 * nonzero_num/(pos - next_check_pos_ + 1) >= 0.95) next_check_pos_ = pos; used_[begin] = 1; size_ = _max(size_, begin + (size_t)siblings[siblings.size()-1].code + 1); for (size_t i = 0; i < siblings.size(); ++i) array_[begin + siblings[i].code].check = begin; for (size_t i = 0; i < siblings.size(); ++i) { std::vector <node_t> new_siblings; if (! fetch(siblings[i], new_siblings)) { array_[begin + siblings[i].code].base = value_ ?(array_type_)(-value_[siblings[i].left]-1) :(array_type_)(-siblings[i].left-1); if (value_ && (array_type_)(-value_[siblings[i].left]-1) >= 0) { error_ = -2; return 0; } ++progress_; if (progress_func_) (*progress_func_) (progress_, key_size_); } else { size_t h = insert(new_siblings); array_[begin + siblings[i].code].base = h; } } return begin; } public: typedef array_type_ value_type; typedef node_type_ key_type; typedef array_type_ result_type; // for compatibility struct result_pair_type { value_type value; size_t length; }; explicit DoubleArrayImpl(): array_(0), used_(0), size_(0), alloc_size_(0), no_delete_(0), error_(0) {} ~DoubleArrayImpl() { clear(); } void set_result(value_type& x, value_type r, size_t) { x = r; } void set_result(result_pair_type& x, value_type r, size_t l) { x.value = r; x.length = l; } void set_array(void *ptr, size_t size = 0) { clear(); array_ =(unit_t *)ptr; no_delete_ = true; size_ = size; } void *array() { return(void *)array_; } void clear() { if (! no_delete_) delete [] array_; delete [] used_; array_ = 0; used_ = 0; alloc_size_ = 0; size_ = 0; no_delete_ = false; } size_t unit_size() const { return sizeof(unit_t); }; size_t size() const { return size_; }; size_t total_size() const { return size_ * sizeof(unit_t); }; size_t nonzero_size() const { size_t result = 0; for (size_t i = 0; i < size_; ++i) if (array_[i].check) ++result; return result; } int build(size_t key_size, key_type **key, size_t *length = 0, value_type *value = 0, int(*progress_func)(size_t, size_t) = 0) { if (! key_size || ! key) return 0; progress_func_ = progress_func; key_ = key; length_ = length; key_size_ = key_size; value_ = value; progress_ = 0; resize(8192); array_[0].base = 1; next_check_pos_ = 0; node_t root_node; root_node.left = 0; root_node.right = key_size; root_node.depth = 0; std::vector <node_t> siblings; fetch(root_node, siblings); insert(siblings); size_ +=(1 << 8 * sizeof(key_type)) + 1; if (size_ >= alloc_size_) resize (size_); delete [] used_; used_ = 0; return error_; } int open(const char *file, const char *mode = "rb", size_t offset = 0, size_t size = 0) { std::FILE *fp = std::fopen(file, mode); if (! fp) return -1; if (std::fseek (fp, offset, SEEK_SET) != 0) return -1; if (! size) { if (std::fseek (fp, 0L, SEEK_END) != 0) return -1; size = std::ftell(fp); if (std::fseek (fp, offset, SEEK_SET) != 0) return -1; } clear(); size_ = size; size_ /= sizeof(unit_t); array_ = new unit_t [size_]; if (size_ != std::fread ((unit_t *)array_, sizeof(unit_t), size_, fp)) return -1; std::fclose(fp); return 0; } int save(const char *file, const char *mode = "wb", size_t offset = 0) { if (! size_) return -1; std::FILE *fp = std::fopen(file, mode); if (! fp) return -1; if (size_ != std::fwrite((unit_t *)array_, sizeof(unit_t), size_, fp)) return -1; std::fclose(fp); return 0; }#ifdef HAVE_ZLIB_H int gzopen(const char *file, const char *mode = "rb", size_t offset = 0, size_t size = 0) { std::FILE *fp = std::fopen(file, mode); if (! fp) return -1; clear(); size_ = size; if (! size_) { if (-1L != (long)std::fseek (fp, (-8), SEEK_END)) { char buf [8]; if (std::fread ((char*)buf, 1, 8, fp) != sizeof(buf)) { std::fclose(fp); return -1; } size_ = LG(buf+4); size_ /= sizeof(unit_t); } } std::fclose(fp); if (! size_) return -1; zlib::gzFile gzfp = zlib::gzopen(file, mode); if (! gzfp) return -1; array_ = new unit_t [size_]; if (zlib::gzseek (gzfp, offset, SEEK_SET) != 0) return -1; zlib::gzread(gzfp,(unit_t *)array_, sizeof(unit_t) * size_); zlib::gzclose(gzfp); return 0; } int gzsave(const char *file, const char *mode = "wb", size_t offset = 0) { zlib::gzFile gzfp = zlib::gzopen(file, mode); if (!gzfp) return -1; zlib::gzwrite(gzfp,(unit_t *)array_, sizeof(unit_t) * size_); zlib::gzclose(gzfp); return 0; }#endif template <class T> inline void exactMatchSearch(const key_type *key, T & result, size_t len = 0, size_t node_pos = 0) { result = exactMatchSearch<T>(key, len, node_pos); return; } template <class T> inline T exactMatchSearch(const key_type *key, size_t len = 0, size_t node_pos = 0) { if (! len) len = length_func_() (key); T result; set_result(result, -1, 0); register array_type_ b = array_[node_pos].base; register array_u_type_ p; for (register size_t i = 0; i < len; ++i) { p = b + (node_u_type_)(key[i]) + 1; if ((array_u_type_)b == array_[p].check) b = array_[p].base; else return result; } p = b; array_type_ n = array_[p].base; if ((array_u_type_)b == array_[p].check && n < 0) { set_result(result, -n-1, len); } return result; } template <class T> size_t commonPrefixSearch(const key_type *key, T* result, size_t result_len, size_t len = 0, size_t node_pos = 0) { if (! len) len = length_func_() (key); register array_type_ b = array_[node_pos].base; register size_t num = 0; register array_type_ n; register array_u_type_ p; for (register size_t i = 0; i < len; ++i) { p = b; // + 0; n = array_[p].base; if ((array_u_type_) b == array_[p].check && n < 0) { if (num < result_len) set_result (result[num], -n-1, i); // result[num] = -n-1; ++num; } p = b + (node_u_type_)(key[i]) + 1; if ((array_u_type_) b == array_[p].check) b = array_[p].base; else return num; } p = b; n = array_[p].base; if ((array_u_type_)b == array_[p].check && n < 0) { if (num < result_len) set_result(result[num], -n-1, len); ++num; } return num; } value_type traverse(const key_type *key, size_t &node_pos, size_t &key_pos, size_t len = 0) { if (! len) len = length_func_() (key); register array_type_ b = array_[node_pos].base; register array_u_type_ p; for (; key_pos < len; ++key_pos) { p = b + (node_u_type_)(key[key_pos]) + 1; if ((array_u_type_)b == array_[p].check) { node_pos = p; b = array_[p].base; } else return -2; // no node } p = b; array_type_ n = array_[p].base; if ((array_u_type_)b == array_[p].check && n < 0) return -n-1; return -1; // found, but no value } };#if 4 == 2 typedef Darts::DoubleArrayImpl<char, unsigned char, short, unsigned short> DoubleArray;#define DARTS_ARRAY_SIZE_IS_DEFINED 1#endif#if 4 == 4 && ! defined(DARTS_ARRAY_SIZE_IS_DEFINED) typedef Darts::DoubleArrayImpl<char, unsigned char, int, unsigned int> DoubleArray;#define DARTS_ARRAY_SIZE_IS_DEFINED 1#endif#if 4 == 4 && ! defined(DARTS_ARRAY_SIZE_IS_DEFINED) typedef Darts::DoubleArrayImpl<char, unsigned char, long, unsigned long> DoubleArray;#define DARTS_ARRAY_SIZE_IS_DEFINED 1#endif#if 4 == 8 && ! defined(DARTS_ARRAY_SIZE_IS_DEFINED) typedef Darts::DoubleArrayImpl<char, unsigned char, long long, unsigned long long> DoubleArray;#endif}#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -