📄 sphinx.rb

📁 sphinx0.9.8.2 中文搜索引擎版本用于跟mysql配合实现高效搜索引擎
💻 RB
字号:
# = sphinx.rb - Sphinx Client Library
#
# Author::    Dmytro Shteflyuk <mailto:kpumuk@kpumuk.info>.
# Copyright:: Copyright (c) 2006 Wildbit, LLC
# License::   Distributes under the same terms as Ruby
# Version::   0.1.0
# Website::   http://kpumuk.info/projects/ror-plugins/using-sphinx-search-engine-in-ruby-on-rails
#
# This library is distributed under the terms of the Ruby license.
# You can freely distribute/modify this library.

require 'socket'

# ==Sphinx Client Library
#
# The Sphinx Client Library is used to communicate with the <tt>searchd</tt>
# daemon and get search results from Sphinx.
#
# ===Usage
#
#   sphinx = Sphinx.new
#   result = sphinx.query('test')
#   ids = result[:matches].map { |id, value| id }.join(',')
#   posts = Post.find :all, :conditions => "id IN (#{ids})"
#
#   docs = posts.map { |post| post.body }
#   excerpts = sphinx.build_excerpts(docs, 'index', 'test')
#
class Sphinx
  # :stopdoc:
  class SphinxError < StandardError; end
  class SphinxConnectError < SphinxError; end
  class SphinxResponseError < SphinxError; end
  class SphinxInternalError < SphinxError; end
  class SphinxTemporaryError < SphinxError; end
  class SphinxUnknownError < SphinxError; end

  # Known searchd commands

  # search command
  SEARCHD_COMMAND_SEARCH  = 0
  # excerpt command
  SEARCHD_COMMAND_EXCERPT = 1

  # Current client-side command implementation versions

  # search command version
  VER_COMMAND_SEARCH  = 0x104
  # excerpt command version
  VER_COMMAND_EXCERPT = 0x100

  # Known searchd status codes (see how get_response maps them to errors)

  # response is usable
  SEARCHD_OK    = 0
  # reported as SphinxInternalError
  SEARCHD_ERROR = 1
  # reported as SphinxTemporaryError
  SEARCHD_RETRY = 2

  # :startdoc:

  # Known match modes

  # match all query words
  SPH_MATCH_ALL      = 0
  # match any query word
  SPH_MATCH_ANY      = 1
  # match this exact phrase
  SPH_MATCH_PHRASE   = 2
  # match this boolean query
  SPH_MATCH_BOOLEAN  = 3
  # match this extended query
  SPH_MATCH_EXTENDED = 4

  # Known sort modes

  # sort by document relevance desc, then by date
  SPH_SORT_RELEVANCE     = 0
  # sort by document date desc, then by relevance desc
  SPH_SORT_ATTR_DESC     = 1
  # sort by document date asc, then by relevance desc
  SPH_SORT_ATTR_ASC      = 2
  # sort by time segments (hour/day/week/etc) desc, then by relevance desc
  SPH_SORT_TIME_SEGMENTS = 3
  # sort by SQL-like expression (eg. "@weight DESC my_attr ASC")
  SPH_SORT_EXTENDED      = 4

  # Known attribute types

  # this attr is just an integer
  SPH_ATTR_INTEGER   = 1
  # this attr is a timestamp
  SPH_ATTR_TIMESTAMP = 2

  # Known grouping functions

  # group by day
  SPH_GROUPBY_DAY   = 0
  # group by week
  SPH_GROUPBY_WEEK  = 1
  # group by month
  SPH_GROUPBY_MONTH = 2
  # group by year
  SPH_GROUPBY_YEAR  = 3
  # group by attribute value
  SPH_GROUPBY_ATTR  = 4

  # Constructs the Sphinx object and sets options to their default values.
  def initialize
    @host       = 'localhost'         # searchd host (default is "localhost")
    @port       = 3312                # searchd port (default is 3312)
    @offset     = 0                   # how many records to seek from result-set start (default is 0)
    @limit      = 20                  # how many records to return from result-set starting at offset (default is 20)
    @mode       = SPH_MATCH_ALL       # query matching mode (default is SPH_MATCH_ALL)
    @weights    = []                  # per-field weights (empty by default)
    @sort       = SPH_SORT_RELEVANCE  # match sorting mode (default is SPH_SORT_RELEVANCE)
    @sortby     = ''                  # attribute to sort by (default is "")
    @min_id     = 0                   # min ID to match (default is 0)
    @max_id     = 0xFFFFFFFF          # max ID to match (default is UINT_MAX)
    @min        = {}                  # attribute name to min-value hash (for range filters)
    @max        = {}                  # attribute name to max-value hash (for range filters)
    @filter     = {}                  # attribute name to values set hash (for values-set filters)
    @groupby    = ''                  # group-by attribute name
    @groupfunc  = SPH_GROUPBY_DAY     # function to pre-process group-by attribute value with
    @maxmatches = 1000                # max matches to retrieve

    @error      = ''                  # last error message
    @warning    = ''                  # last warning message
  end

  # Get last error message.
  def last_error
    @error
  end

  # Get last warning message.
  def last_warning
    @warning
  end

  # Set searchd server.
  def set_server(host, port)
    @host = host
    @port = port
  end

  # Set match offset, count, and max number to retrieve.
  #
  # <tt>max</tt> is only applied when it is greater than zero; otherwise the
  # previously configured maximum is kept.
  def set_limits(offset, limit, max = 0)
    @offset = offset
    @limit = limit
    @maxmatches = max if max > 0
  end

  # Set match mode (one of the <tt>SPH_MATCH_*</tt> constants).
  def set_match_mode(mode)
    @mode = mode
  end

  # Set sort mode (one of the <tt>SPH_SORT_*</tt> constants) and the
  # attribute or expression to sort by.
  def set_sort_mode(mode, sortby = '')
    @sort = mode
    @sortby = sortby
  end

  # Set per-field weights (an array of integers).
  def set_weights(weights)
    @weights = weights
  end

  # Set IDs range to match.
  #
  # Only match those records where document ID is between <tt>min_id</tt> and
  # <tt>max_id</tt> (including <tt>min_id</tt> and <tt>max_id</tt>).
  def set_id_range(min_id, max_id)
    @min_id = min_id
    @max_id = max_id
  end

  # Set values filter.
  #
  # Only match those records where <tt>attr</tt> column values
  # are in specified set (an array of integers).
  def set_filter(attr, values)
    @filter[attr] = values
  end

  # Set range filter.
  #
  # Only match those records where <tt>attr</tt> column value
  # is between <tt>min</tt> and <tt>max</tt> (including <tt>min</tt> and <tt>max</tt>).
  def set_filter_range(attr, min, max)
    @min[attr] = min
    @max[attr] = max
  end

  # Set grouping.
  #
  # * <tt>attr</tt> -- name of the attribute to group by
  # * <tt>func</tt> -- one of the <tt>SPH_GROUPBY_*</tt> constants
  def set_group_by(attr, func)
    @groupby = attr
    @groupfunc = func
  end

  # Connect to searchd server and run given search query.
  #
  # * <tt>query</tt> -- query string
  # * <tt>index</tt> -- index name to query, default is "*" which means to query all indexes
  #
  # Returns a hash which has the following keys on success:
  #
  # * <tt>:fields</tt> -- array of field names reported by searchd
  # * <tt>:attrs</tt> -- hash which maps attribute name (symbol) to its type code
  # * <tt>:matches</tt> -- hash which maps found document_id to a hash with
  #   <tt>:weight</tt> and, when the schema has attributes, <tt>:attrs</tt>
  # * <tt>:total</tt> -- total amount of matches retrieved
  # * <tt>:total_found</tt> -- total amount of matching documents in index
  # * <tt>:time</tt> -- search time in seconds, formatted as a string with 3 decimals
  # * <tt>:words</tt> -- hash which maps query terms to ( :docs, :hits ) hash
  #
  # Raises a SphinxError subclass when the connection, the transfer or the
  # response is not usable.
  def query(query, index = '*')
    # mode and limits
    req = [@offset, @limit, @mode, @sort].pack('NNNN')
    req << pack_string(@sortby)
    # query itself
    req << pack_string(query)
    # weights
    req << [@weights.length].pack('N')
    req << @weights.pack('N*')
    # indexes
    req << pack_string(index)
    # id range
    req << [@min_id.to_i, @max_id.to_i].pack('NN')

    # filters: range filters first (marked by a zero value count), then value sets
    req << [@min.length + @filter.length].pack('N')
    @min.each do |attribute, min|
      req << pack_string(attribute)
      req << [0, min, @max[attribute]].pack('NNN')
    end
    @filter.each do |attribute, values|
      req << pack_string(attribute)
      req << [values.length].pack('N')
      req << values.pack('N*')
    end

    # group-by
    req << [@groupfunc].pack('N')
    req << pack_string(@groupby)

    # max matches to retrieve
    req << [@maxmatches].pack('N')

    response = request(SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, req)
    parse_search_response(response)
  end

  # Connect to searchd server and generate excerpts from given documents.
  #
  # * <tt>docs</tt> -- an array of strings which represent the documents' contents
  # * <tt>index</tt> -- a string specifying the index whose settings will be used
  #   for stemming, lexing and case folding
  # * <tt>words</tt> -- a string which contains the words to highlight
  # * <tt>opts</tt> is a hash which contains additional optional highlighting parameters.
  #   The hash passed by the caller is not modified.
  #
  # You can use following parameters:
  # * <tt>:before_match</tt> -- a string to insert before a set of matching words, default is "<b>"
  # * <tt>:after_match</tt> -- a string to insert after a set of matching words, default is "</b>"
  # * <tt>:chunk_separator</tt> -- a string to insert between excerpts chunks, default is " ... "
  # * <tt>:limit</tt> -- max excerpt size in symbols (codepoints), default is 256
  # * <tt>:around</tt> -- how many words to highlight around each match, default is 5
  #
  # Returns an array of string excerpts on success, one per document.
  # Raises SphinxResponseError ("incomplete reply") when the response is
  # shorter than the excerpts it announces.
  def build_excerpts(docs, index, words, opts = {})
    # Apply defaults on a copy; nil/false values fall back to the default,
    # matching the former "||=" behaviour without touching the caller's hash.
    options = {
      :before_match    => '<b>',
      :after_match     => '</b>',
      :chunk_separator => ' ... ',
      :limit           => 256,
      :around          => 5
    }.merge(opts.select { |_key, value| value })

    # v.1.0 req
    req = [0, 1].pack('N2') # mode=0, flags=1 (remove spaces)
    req << pack_string(index)
    req << pack_string(words)
    # options
    req << pack_string(options[:before_match])
    req << pack_string(options[:after_match])
    req << pack_string(options[:chunk_separator])
    req << [options[:limit].to_i, options[:around].to_i].pack('NN')

    # documents
    req << [docs.size].pack('N')
    docs.each { |doc| req << pack_string(doc) }

    response = request(SEARCHD_COMMAND_EXCERPT, VER_COMMAND_EXCERPT, req)

    # parse response: one length-prefixed excerpt per submitted document
    pos = 0
    docs.map do
      excerpt, pos = read_string(response, pos)
      excerpt
    end
  end

  private

  # Connect to searchd server and perform the protocol version handshake.
  #
  # Returns the connected socket. Raises SphinxConnectError when the
  # connection fails or the server does not announce protocol version 1+.
  def connect
    begin
      sock = TCPSocket.new(@host, @port)
    rescue StandardError
      @error = "connection to #{@host}:#{@port} failed"
      raise SphinxConnectError, @error
    end

    # An empty read (peer closed the connection) yields nil here.
    v = sock.recv(4).to_s.unpack('N').first
    if v.nil? || v < 1
      sock.close
      @error = "expected searchd protocol version 1+, got version '#{v}'"
      raise SphinxConnectError, @error
    end

    sock.send([1].pack('N'), 0)
    sock
  end

  # Send one command packet (header + body) and return the response body.
  # The socket is always closed before this method returns or raises.
  def request(command, version, body)
    sock = connect
    begin
      sock.send([command, version, body.bytesize].pack('nnN') + body, 0)
    rescue StandardError
      sock.close
      raise
    end
    get_response(sock, version)
  end

  # Get and check response packet from searchd server.
  #
  # Reads the 8-byte header (status, version, body length) and the body,
  # closes the socket, then validates length, status and version.
  # Returns the raw response body; sets the warning message when the server
  # command version is older than <tt>client_version</tt>.
  def get_response(sock, client_version)
    status = ver = len = nil
    response = ''
    begin
      # len stays nil when fewer than 8 header bytes arrived
      status, ver, len = read_bytes(sock, 8).unpack('n2N')
      response = read_bytes(sock, len) if len
    ensure
      sock.close
    end

    # check response
    if len.nil?
      @error = 'received zero-sized searchd response'
      raise SphinxResponseError, @error
    end
    read = response.bytesize
    if read != len
      @error = "failed to read searchd response (status=#{status}, ver=#{ver}, len=#{len}, read=#{read})"
      raise SphinxResponseError, @error
    end

    # check status; error bodies carry the message after a 4-byte length prefix
    case status
    when SEARCHD_OK
      # fall through to the version check
    when SEARCHD_ERROR
      @error = 'searchd error: ' + response[4..-1].to_s
      raise SphinxInternalError, @error
    when SEARCHD_RETRY
      @error = 'temporary searchd error: ' + response[4..-1].to_s
      raise SphinxTemporaryError, @error
    else
      @error = "unknown status code '#{status}'"
      raise SphinxUnknownError, @error
    end

    # check version
    if ver < client_version
      @warning = format(
        "searchd command v.%d.%d older than client's v.%d.%d, some options might not work",
        ver >> 8, ver & 0xff, client_version >> 8, client_version & 0xff
      )
    end

    response
  end

  # Read up to <tt>count</tt> bytes from the socket, stopping early when the
  # peer closes the connection (recv returns an empty string or nil).
  def read_bytes(sock, count)
    data = ''.b
    while data.bytesize < count
      begin
        chunk = sock.recv(count - data.bytesize)
      rescue EOFError
        break
      end
      break if chunk.nil? || chunk.empty?
      data << chunk
    end
    data
  end

  # Encode a value as a 32-bit big-endian byte length followed by its bytes.
  # Works on a binary copy so multibyte text is measured and appended in bytes.
  def pack_string(value)
    bytes = value.to_s.b
    [bytes.bytesize].pack('N') << bytes
  end

  # Decode the body of a search response into the result hash returned by #query.
  def parse_search_response(response)
    result = {}
    pos = 0

    # read schema: field names, then attribute names with their type codes
    nfields, pos = read_int(response, pos)
    fields = []
    nfields.times do
      name, pos = read_string(response, pos)
      fields << name
    end
    result[:fields] = fields

    nattrs, pos = read_int(response, pos)
    attrs = {}
    attr_names = [] # keeps the schema order used when decoding each match
    nattrs.times do
      name, pos = read_string(response, pos)
      type, pos = read_int(response, pos)
      attr_names << name.to_sym
      attrs[name.to_sym] = type
    end
    result[:attrs] = attrs

    # read matches
    count, pos = read_int(response, pos)
    matches = {}
    count.times do
      (doc, weight), pos = read_ints(response, pos, 2)
      match = (matches[doc] ||= {})
      match[:weight] = weight
      attr_names.each do |name|
        value, pos = read_int(response, pos)
        (match[:attrs] ||= {})[name] = value
      end
    end
    result[:matches] = matches

    (total, total_found, msecs, nwords), pos = read_ints(response, pos, 4)
    result[:total] = total
    result[:total_found] = total_found
    # float division: the value is divided by 1000 and shown with 3 decimals
    result[:time] = '%.3f' % (msecs / 1000.0)

    words = {}
    nwords.times do
      word, pos = read_string(response, pos)
      (docs, hits), pos = read_ints(response, pos, 2)
      words[word] = { :docs => docs, :hits => hits }
    end
    result[:words] = words

    result
  end

  # Read one 32-bit big-endian integer at +pos+; returns [value, next_pos].
  def read_int(response, pos)
    values, pos = read_ints(response, pos, 1)
    [values.first, pos]
  end

  # Read +count+ 32-bit big-endian integers at +pos+; returns [values, next_pos].
  def read_ints(response, pos, count)
    size = 4 * count
    chunk = response[pos, size]
    raise_incomplete_reply if chunk.nil? || chunk.bytesize < size
    [chunk.unpack("N#{count}"), pos + size]
  end

  # Read one length-prefixed string at +pos+; returns [string, next_pos].
  def read_string(response, pos)
    len, pos = read_int(response, pos)
    chunk = response[pos, len]
    raise_incomplete_reply if chunk.nil? || chunk.bytesize < len
    [chunk, pos + len]
  end

  # Record and raise the error used for truncated responses.
  def raise_incomplete_reply
    @error = 'incomplete reply'
    raise SphinxResponseError, @error
  end
end
💿 文件大小 626 K
👤 上传用户 GUAIGUAICHENGTI
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#sphinx #mysql #搜索引擎 #版本
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -