📄 cdict.py

📁 在网上下载的程序
💻 PY
字号:
# -*- coding: cp936 -*-
# author:sunjoy
# email:ccnusjy@gmail.com

import bsddb,os
import re
class CDict:
    """Dictionary-based word segmenter using reverse maximum matching.

    The sentence is scanned from its end towards its start.  At each position
    a run of ASCII "word" characters (letters, digits and ``- + # @ _ .``) is
    emitted as one lower-cased token, then the longest slice of at most
    ``MAX_WORD_LEN`` characters that is a key of the dictionary ``self.d`` is
    emitted.  Dictionary keys are looked up as UTF-8 encoded byte strings and
    every returned token is a UTF-8 encoded byte string.

    Because of the backward scan, tokens are returned in reverse sentence
    order (last word first).
    """

    # Longest dictionary word, in characters, tried at each position.  Words
    # longer than this can never be matched as a whole; raise the value on
    # the class or on an instance to allow longer matches.
    MAX_WORD_LEN = 4

    # Punctuation that segWords() replaces with a space before segmenting.
    _PUNCT_RE = re.compile(u"[。，,！……!《》<>\"':：？\\?、\\|“”‘’；]")

    # One character that belongs to an ASCII token (English word, number,
    # e-mail-like or "c++"-like string).
    _ASCII_TOKEN_RE = re.compile(r"[0-9A-Za-z\-\+#@_\.]{1}")

    def __init__(self, path='data/dict.dat'):
        """Open the B-tree dictionary file at *path*.

        The file is opened with bsddb flag ``'c'``.  The default path is
        relative, so it is resolved against the current working directory.
        """
        # Bind the attribute first so close()/__del__ are safe even when
        # btopen() raises.
        self.d = None
        self.d = bsddb.btopen(path, 'c')

    def close(self):
        """Close the dictionary handle; calling it again is a no-op."""
        db = getattr(self, 'd', None)
        if db is not None:
            self.d = None
            db.close()

    def __del__(self):
        self.close()

    def _to_text(self, sentence):
        """Return *sentence* as unicode text, or None if it cannot be decoded.

        Byte strings are decoded as UTF-8; text that is already unicode is
        returned unchanged.
        """
        if isinstance(sentence, type(u"")):
            return sentence
        try:
            return sentence.decode('utf-8')
        except (AttributeError, UnicodeError):
            # Not a string at all, or not valid UTF-8.
            return None

    def _take_ascii_token(self, text, end, result):
        """Append the ASCII token ending at *end* (if any); return its start.

        When the character before *end* is not an ASCII token character,
        nothing is appended and *end* itself is returned.
        """
        start = end
        while start > 0 and self._ASCII_TOKEN_RE.search(text[start - 1:start]) is not None:
            start -= 1
        if start < end:
            result.append(text[start:end].lower().encode('utf-8'))
        return start

    def _take_dict_word(self, text, end, result):
        """Append the longest dictionary word ending at *end*.

        Returns the start index of the matched word, or -1 when no slice of
        1..MAX_WORD_LEN characters ending at *end* is in the dictionary.
        """
        for size in range(self.MAX_WORD_LEN, 0, -1):
            start = end - size
            if start < 0:
                continue
            word = text[start:end].encode('utf-8')
            # has_key() is kept because it is the lookup call the bsddb
            # object is known to provide.
            if self.d.has_key(word):
                result.append(word)
                return start
        return -1

    def segWords(self, sentence):
        """Segment *sentence* after replacing punctuation with spaces.

        A character that is not in the dictionary is handled as follows:
        if the character after it is whitespace it is emitted on its own;
        otherwise, if it is not whitespace itself, it is glued in front of
        the most recently emitted token when that token is shorter than
        12 bytes, and the combined string is emitted as an additional token
        (the earlier token stays in the result).

        Returns a list of UTF-8 byte strings in reverse sentence order, with
        whitespace-only tokens removed.  Returns ``[]`` when *sentence* is
        not text or is not valid UTF-8.
        """
        text = self._to_text(sentence)
        if text is None:
            return []
        text = self._PUNCT_RE.sub(" ", text)

        length = len(text)
        result = []
        i = length
        while i > 0:
            i = self._take_ascii_token(text, i, result)
            found = self._take_dict_word(text, i, result)
            if found != -1:
                i = found
                continue

            # Unknown character: text[i-1:i] is empty when an ASCII token
            # reached the very start of the sentence, so nothing is added.
            ch = text[i - 1:i]
            if i < length and not text[i].strip():
                result.append(ch.encode('utf-8'))
            elif ch.strip():
                if result and len(result[-1]) < 12:
                    result.append(ch.encode('utf-8') + result[-1])
                else:
                    result.append(ch.encode('utf-8'))
            i -= 1

        return [w for w in result if w.strip()]

    def segWords2(self, sentence):
        """Segment *sentence* without any punctuation preprocessing.

        Every character that is neither part of an ASCII token nor part of
        a dictionary word is emitted as a single-character token.

        Returns a list of UTF-8 byte strings in reverse sentence order, with
        whitespace-only tokens removed.  Returns ``[]`` when *sentence* is
        not text or is not valid UTF-8.
        """
        text = self._to_text(sentence)
        if text is None:
            return []

        result = []
        i = len(text)
        while i > 0:
            i = self._take_ascii_token(text, i, result)
            found = self._take_dict_word(text, i, result)
            if found == -1:
                # Not in the dictionary: keep the single character as is.
                result.append(text[i - 1:i].encode('utf-8'))
                i -= 1
            else:
                # The matched word is already in result; continue before it.
                i = found

        return [w for w in result if w.strip()]



if __name__=="__main__":
    # Demo: segment a sample sentence with segWords2() and print one token
    # per line.  segWords() can be called the same way to try the other mode.
    d=CDict()
    # NOTE(review): .decode('gbk') assumes this source file is saved in the
    # cp936/GBK encoding announced by its coding line - confirm before
    # re-encoding the file.
    words=d.segWords2("""我爱北京天安门，我叫孙君意，我爱python and CAAA++ 我是张永伟中华人民共和国iwy what？""".decode('gbk').encode('utf-8'))
    # Drop the only reference to the segmenter now that segmentation is done;
    # in CPython this runs CDict.__del__, which closes the dictionary file.
    del d
    print("==========冗余模式=============")
    for w in words:
        print(w.decode('utf-8'))
💿 文件大小 2323 K
👤 上传用户 yeling023
📂 所属分类多国语言处理
📄 代码行数 186 行
💻 语言类型 Python
🏷️ 相关标签

#程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -