⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 winpy2phrase.py

📁 这是一个嵌入式系统上运行的中文输入法
💻 PY
字号:
#################################################################
# tool to extract 2-hanzi phrases from WINPY.txt
# also create MurphyTalk phrase file and pinyin phrase index file
# 
# you have to install the cjkcodecs package
#
# MurphyTalk 2004/7
# $Date: 2004/07/31 17:21:05 $

import codecs
import cjkcodecs


#filter a line in winpy.txt
#only filter 2-hanzi phrases
#result -> pair (pinyin,phrase)
def filter(line):
    """Extract a (pinyin, phrase) pair from one line of WINPY.txt.

    Only 2-hanzi phrase lines are accepted: the first two characters must
    not be pinyin characters and the third character must be one.

    line -- one decoded text line, normally still carrying its line ending.

    Returns (pinyin, phrase), where phrase is the first two characters of
    the line and pinyin is the remainder without the line ending.  Returns
    the empty tuple () when the line is not a 2-hanzi phrase entry.

    NOTE: the name shadows the builtin filter(); it is kept because it is
    the name callers use.
    """
    def is_alpha(a):
        # A pinyin character is a lowercase ASCII letter or "*".
        return ('a' <= a <= 'z') or a == "*"

    # Lines of four characters or fewer are rejected outright; this also
    # guarantees that indexes 0..2 exist below.
    if len(line) <= 4:
        return ()

    # Two leading non-pinyin characters followed by a pinyin character.
    if is_alpha(line[0]) or is_alpha(line[1]) or not is_alpha(line[2]):
        return ()

    # Strip the real line terminator rather than blindly dropping the last
    # two characters: that only worked for "\r\n" and truncated the pinyin
    # of "\n"-terminated lines or of an unterminated final line.
    pinyin = line[2:].rstrip("\r\n")
    return (pinyin, line[:2])

#map -> {pinyin:[phrases,...]}
def build_phrase_map(map):
    """Fill *map* with the 2-hanzi phrases found in WINPY.TXT.

    map -- dict updated in place.  Each key is a pinyin string and each
           value is the list of phrases having that pinyin, in the order
           they appear in the file.  (The parameter name shadows the
           builtin map(); it is kept so callers are unaffected.)

    Reads "WINPY.TXT" from the current directory with the
    "cjkcodecs.gb2312" codec, prints one progress message per input line
    and a final count of extracted phrases.  Returns None.
    """
    fSrc = codecs.open("WINPY.TXT", "r", "cjkcodecs.gb2312")
    try:
        lines = fSrc.readlines()
    finally:
        # Release the handle even when reading/decoding raises.
        fSrc.close()

    count = 0
    line_no = 0
    for l in lines:
        line_no += 1  # line numbers in the messages are 1-based
        pair = filter(l)
        if len(pair) == 2:
            # Single-argument parenthesised print behaves the same as the
            # print statement and also parses on newer interpreters.
            print("Line %d filtered" % line_no)
            pinyin, phrase = pair
            # Create the list on first sight of this pinyin, then append.
            map.setdefault(pinyin, []).append(phrase)
            count += 1
        else:
            print("Line %d discarded" % line_no)

    print("%d phrases extracted" % count)

def main():
    """Write the MurphyTalk phrase file and its pinyin index file.

    Collects the phrases via build_phrase_map(), then creates two files in
    the current directory:

    murphytalk_phrase.dat     -- a two-line text header followed, for each
                                 phrase, by a 4-byte zero frequency field
                                 and the UTF-8 phrase text plus one space.
    murphytalk_phrase_idx.txt -- a two-line text header followed by one
                                 line per pinyin (sorted): the pinyin, a
                                 tab, the number of phrases, a tab, then
                                 the offset of each phrase record in the
                                 phrase file, each followed by a space.

    Returns None.
    """
    # Local import: only this function needs it.
    import struct

    phrases_map = {}
    build_phrase_map(phrases_map)

    # Initial frequency: a uint32 zero, i.e. four zero bytes (byte order is
    # irrelevant for zero).  Packed so it is a byte string on any Python.
    freq = struct.pack("<I", 0)

    # The phrase file contains raw bytes, and codecs.open always opens in
    # binary mode when an encoding is given, so ask for "wb" explicitly
    # instead of the contradictory text mode "wt".
    fPhrase = codecs.open("murphytalk_phrase.dat", "wb", "utf-8")
    try:
        fPhrase.write("MurphyTalk Phrase Table\nVer 0.03\n")

        fIndex = open("murphytalk_phrase_idx.txt", "wt")
        try:
            fIndex.write("MurphyTalk Pinyin Phrase Index Table\nVer 0.03\n")

            keys = list(phrases_map.keys())
            keys.sort()
            for k in keys:
                phrases = phrases_map[k]
                fIndex.write(k)
                fIndex.write("\t%d\t" % len(phrases))
                for phrase in phrases:
                    # Record this phrase's offset in the phrase file.
                    fIndex.write("%d " % fPhrase.tell())
                    # The frequency is raw bytes, so it bypasses the codec
                    # and goes straight to the underlying stream.
                    fPhrase.stream.write(freq)
                    # The phrase text itself goes through the UTF-8 codec.
                    fPhrase.write("%s " % phrase)

                fIndex.write("\n")
        finally:
            fIndex.close()
    finally:
        # Both files are closed even if writing fails part-way through.
        fPhrase.close()

# Run the conversion only when this file is executed as a script, so that
# importing the module does not read or write any files.
if __name__ == "__main__":
    main()

#
# $Log: winpy2phrase.py,v $
# Revision 1.3  2004/07/31 17:21:05  Lu Mu
# create phrase file in format V0.03
#
#

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -