# File: wordbreak.py
# Font size: (code-viewer page residue, not part of the program)
# -*- coding: gbk -*-
#!/usr/bin/env python
#author onebird:
#gbk wordbreak
import copy
from sets import Set
class CWordBreak:
    """Dictionary-driven word breaker for double-byte (GBK-style) text.

    The dictionary is stored as a trie of nested dicts keyed by two-character
    slices (one double-byte character per level). The trie carries no
    end-of-word marker, so every prefix of a dictionary entry is accepted as
    a word during matching.

    Text is processed as a sequence of single bytes/characters: a value above
    0x80 is treated as the lead byte of a two-byte character, everything else
    as a single-byte (ASCII) character.
    """

    def __init__(self):
        # Trie root: {two-char slice: sub-trie}.
        self.zDict = {}
        # NOTE(review): several literals below occur twice (e.g. "!" / "!",
        # "\n " / "\n "). Presumably the duplicates were full-width variants
        # that were lost when the file was transcoded -- confirm against the
        # original GBK source. They are reproduced here unchanged.
        # Separators that end a sentence.
        self.sentSpliters = [". ", "!", "?", "。", "!", "?"]
        # Separators that end a snippet (clause) inside a sentence.
        self.snipSpliters = [",", ",", ";", ";", ":", ":", ",", ","]
        # Double-byte characters that always terminate the pending word.
        self.wordCutSet = set([
            ",", " ", ",", "“", "’", ":", "(", ")", "》", "!", "【", "】", "?",
        ])
        # Single-byte characters that terminate the pending word and are
        # emitted as tokens of their own.
        self.byteCutSet = set([
            ",", " ", "\t", "'", "\"", ":", "(", ")", ">", "<", "}", "{",
            "!", "{", "]", "?",
        ])
        # Separators that end a section (paragraph).
        self.sectSpliters = ["\n\n", "\r\n\r\n", "\n ", "\n\t", "\n "]

    def LoadDictFile(self, strPath):
        """Add every line of the word list at *strPath* to the trie.

        Each line is one word; trailing "\\n" / "\\r" are stripped and the
        word is inserted two characters at a time. Raises whatever ``open``
        raises when the file cannot be read.
        """
        with open(strPath) as fDict:
            for aLine in fDict:
                aLine = aLine.rstrip("\n").rstrip("\r")
                node = self.zDict
                for nPos in range(0, len(aLine), 2):
                    node = node.setdefault(aLine[nPos:nPos + 2], {})

    def WordBreak(self, inStr):
        """Return *inStr* segmented, with every token followed by one space."""
        wds = []
        self.BreakWords(inStr, wds)
        return "".join(wd + " " for wd in wds)

    def BreakWords(self, inStr, wds):
        """Segment *inStr* and append the resulting tokens to *wds*.

        The text is cut into sentences, each sentence into snippets, and each
        snippet is segmented independently, in document order.
        """
        sents = []
        self.TxtToSentence(inStr, sents)
        for aSent in sents:
            snips = []
            self.SplitX(aSent, snips, self.snipSpliters)
            for aSnip in snips:
                self.BreakToWords(aSnip, wds)

    def BreakToWords(self, inStr, segs):
        """Segment one snippet and append its tokens to *segs*.

        Runs of double-byte characters are matched greedily against the trie;
        a double-byte character that starts no dictionary entry is emitted on
        its own. Runs of single-byte characters are grouped into words, with
        every character of ``byteCutSet`` emitted as a separate token.
        """
        root = self.zDict
        node = root   # trie cursor; always at the root when no word is pending
        wd = ""       # token being accumulated
        nLang = 0     # 0 = initial, 1 = double-byte (Chinese), 2 = single-byte
        nPos = 0
        nLen = len(inStr)
        while nPos < nLen:
            ch = inStr[nPos]
            if ord(ch) > 0x80:
                # A lead byte needs a trail byte of at least 0x40; otherwise
                # it is a corrupted/truncated byte and is skipped.
                if nPos + 1 >= nLen or ord(inStr[nPos + 1]) < 0x40:
                    nPos += 1
                    continue
                if nLang == 2 and wd:
                    # Single-byte -> double-byte switch: emit the pending word.
                    segs.append(wd)
                    wd = ""
                nLang = 1
                zi = inStr[nPos:nPos + 2]
                if zi in self.wordCutSet and wd:
                    # A cut character ends the pending word; matching
                    # restarts from the trie root.
                    segs.append(wd)
                    wd = ""
                    node = root
                if zi in node:
                    wd += zi
                    node = node[zi]
                else:
                    # zi does not continue the current path, so the pending
                    # word is the longest match; restart from the root.
                    if wd:
                        segs.append(wd)
                        wd = ""
                    node = root
                    if zi in node:
                        wd = zi
                        node = node[zi]
                    else:
                        segs.append(zi)
                nPos += 2
            else:
                if nLang == 1 and wd:
                    # Double-byte -> single-byte switch: emit the pending word.
                    segs.append(wd)
                    wd = ""
                # Any single-byte character ends a trie walk.
                node = root
                if ch in self.byteCutSet:
                    if wd:
                        segs.append(wd)
                        wd = ""
                    segs.append(ch)
                else:
                    wd += ch
                nPos += 1
                nLang = 2
        if wd:
            segs.append(wd)

    def TxtToSentence(self, inStr, segs):
        """Split *inStr* into sections, then sentences; append them to *segs*."""
        sects = []
        self.TxtToSection(inStr, sects)
        for aSect in sects:
            self.SectionToSentence(aSect, segs)

    def TxtToSection(self, inStr, segs):
        """Split *inStr* on the section separators; append pieces to *segs*."""
        self.SplitX(inStr, segs, self.sectSpliters)

    def SectionToSentence(self, inStr, segs):
        """Split *inStr* on the sentence separators; append pieces to *segs*."""
        self.SplitX(inStr, segs, self.sentSpliters)

    def SplitX(self, inStr, segs, spliters):
        """Split *inStr* on each separator of *spliters* in turn.

        Every separator is applied to the pieces produced by the previous
        ones; the final pieces are appended to *segs*.
        """
        pieces = [inStr]
        for aSP in spliters:
            outSegs = []
            self.Split(pieces, outSegs, aSP)
            if outSegs:
                pieces = outSegs
        segs.extend(pieces)

    def Split(self, inSegs, segs, strSP):
        """Split every string of *inSegs* on *strSP*, appending to *segs*.

        The separator stays attached to the end of the piece it terminates;
        a string ending with the separator therefore yields a trailing empty
        piece.
        """
        for inStr in inSegs:
            tSegs = inStr.split(strSP)
            segs.extend(aSeg + strSP for aSeg in tSegs[:-1])
            segs.append(tSegs[-1])
if __name__ == "__main__":
    # Demo: load the word list, segment the contents of ym.txt and print the
    # space-separated result.
    wb = CWordBreak()
    wb.LoadDictFile("./wordlist.txt")
    # Close the input file deterministically instead of leaking the handle.
    with open("ym.txt") as ymF:
        ymTxt = ymF.read()
    strOut = wb.WordBreak(ymTxt)
    print(strOut)
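# NOTE: the lines below are code-viewer page residue, not part of the program.
# Keyboard shortcuts:
#   Copy code:          Ctrl + C
#   Search code:        Ctrl + F
#   Fullscreen mode:    F11
#   Toggle theme:       Ctrl + Shift + D
#   Show shortcuts:     ?
#   Increase font size: Ctrl + =
#   Decrease font size: Ctrl + -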