⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pinyinphrase.cpp

📁 对MurphyPinyin的改进版
💻 CPP
字号:
/**********************************************************************
 * ** Copyright (C) 2004 MurphyTalk
 * **
 * ** This file is part of the MurphyTalk PinYin Chinese input method,
 * ** which is for the Qtopia Environment.
 * **
 * ** This is partially based on Smart Chinese Input Method 
 * ** by James Su
 * ** Copyright (c) 2002 James Su <suzhe@turbolinux.com.cn>
 * **
 * ** This file may be distributed and/or modified under the terms of the
 * ** GNU General Public License version 2 as published by the Free Software
 * ** Foundation and appearing in the file LICENSE.GPL included in the
 * ** packaging of this file.
 * **
 * ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
 * ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 * **
 * **
 * ** murphytalk@gmail.com
 * ** $Date: 2004/07/31 17:23:36 $
 * **
 * **********************************************************************/
#include "PinyinPhrase.h"
#include "../public.h"

#include <map>

/////////////////////////////////////////////////////////////////////////
//utf8 <=> ucs-2 conversion
//
//from linux kernel fs/nls/nls_base.c
extern int utf8_mbtowc(ucs4_t *p, const __u8 *s, int n);
/**
 * Convert a UTF-8 byte sequence into wide characters.
 *
 * Adapted from the Linux kernel (fs/nls/nls_base.c).
 *
 * @param pwcs destination buffer. At most one element is produced per source
 *             byte, so the caller must provide room for n elements. No
 *             terminating zero is written.
 * @param s    source bytes; conversion stops at the first NUL byte.
 * @param n    maximum number of source bytes to consume.
 * @return the number of wide characters stored in pwcs.
 */
int utf8_mbstowcs(ucs4_t *pwcs, const __u8 *s, int n)
{
	ucs4_t *op = pwcs;
	const __u8 *ip = s;

	/* Test the remaining budget first so no byte past the limit is read. */
	while (n > 0 && *ip) {
		if (*ip & 0x80) {
			/* Lead byte of a multi-byte sequence. */
			const int size = utf8_mbtowc(op, ip, n);
			if (size == -1) {
				/* Ignore character and move on */
				ip++;
				n--;
			} else {
				op++;
				ip += size;
				n -= size;
			}
		} else {
			/* Plain ASCII: copy through and charge one byte against the
			 * limit (the missing decrement let the loop run past n). */
			*op++ = *ip++;
			n--;
		}
	}
	return (op - pwcs);
}

extern int utf8_wctomb(__u8 *s, ucs4_t wc, int maxlen);
/**
 * Convert a zero-terminated wide-character string into UTF-8.
 *
 * Adapted from the Linux kernel (fs/nls/nls_base.c).
 *
 * @param s      destination byte buffer; no terminating NUL is written.
 * @param pwcs   source characters; conversion stops at the first zero element.
 * @param maxlen capacity of s in bytes.
 * @return the number of bytes stored in s.
 */
int utf8_wcstombs(__u8 *s, const ucs4_t *pwcs, int maxlen)
{
	const ucs4_t *ip = pwcs;
	__u8 *op = s;

	/* Test the remaining capacity first so the source is not read once the
	 * output buffer is full. */
	while (maxlen > 0 && *ip) {
		if (*ip > 0x7f) {
			const int size = utf8_wctomb(op, *ip, maxlen);
			if (size == -1) {
				/* Ignore character and move on */
				maxlen--;
			} else {
				op += size;
				maxlen -= size;
			}
		} else {
			/* ASCII takes exactly one output byte; account for it so the
			 * destination buffer cannot be overrun. */
			*op++ = (__u8) *ip;
			maxlen--;
		}
		ip++;
	}
	return (op - s);
}

std::ostream& utf8_write_phase_string(std::ostream& os,PhraseString& str)
{	
	std::vector<__u8> s;
	int size=str.size()*4;
	s.reserve(size); //utf8
	ucs4_t*p=&str[0];
	int len=utf8_wcstombs(&s[0],p,size);
	for(int i=0;i<len;i++){
		os<<s[i];
	}
	return os;	
}

/////////////////////////////////////////////////////////////////////////
// overloaded operators
//
std::ifstream& operator >> (std::ifstream& is, uint32& value)
{
	char buf[sizeof(uint32)];
	for(unsigned int i=0;i<sizeof(buf);i++){
		is>>buf[i];
	}
	
	value=*((uint32*)buf);
	
	return is;
}

/**
 * Write a uint32 as raw bytes in host byte order.
 *
 * @param os    destination stream.
 * @param value value whose sizeof(uint32) bytes are written verbatim.
 * @return os.
 */
std::fstream& operator << (std::fstream& os, uint32& value)
{
	os.write(reinterpret_cast<const char*>(&value),sizeof(value));
	return os;
}

/**
 * Compare two phrases for equality.
 *
 * A phrase's content ends at its first zero element or at the end of the
 * container, whichever comes first, so a phrase with a trailing terminator
 * equals the same phrase without one. Both the length and the characters of
 * the content must match; a phrase is not equal to a longer phrase that
 * merely starts with it.
 *
 * @return true when both phrases have identical content.
 */
bool operator == (PhraseString& left,PhraseString& right)
{
	const unsigned int left_size=left.size();
	const unsigned int right_size=right.size();

	for(unsigned int i=0;;i++){
		const bool left_done =(i>=left_size) ||left[i]==0;
		const bool right_done=(i>=right_size)||right[i]==0;
		if(left_done||right_done){
			// equal only when both contents end at the same position
			return left_done&&right_done;
		}
		if(left[i]!=right[i])  return false;
	}
}



///////////////////////////////////////////////////////////////////////////////////
//

static const char murphytalk_Phrase_index_header [] = "MurphyTalk Pinyin Phrase Index Table";
static const char murphytalk_Phrase_index_version[] = "Ver "FILE_VERSION;
static const char murphutalk_Phrase_file         [] =  
#ifdef X86
"/root/murphytalk_phrase.dat";
#else
"/home/zaurus/.murphytalk/murphytalk_phrase.dat";
#endif



/**
 * Construct the table and load its entries from an index file.
 *
 * @param indexfile path handed to load_index().
 */
PinyinPhraseTable::PinyinPhraseTable(const char* indexfile)
{
	// The result of load_index() is not inspected here.
	load_index(indexfile);
}

/** Destructor; performs no explicit work. */
PinyinPhraseTable::~PinyinPhraseTable()
{
}

/**
 * Load phrase entries from an index file.
 *
 * @param indexfile path of the index file to open.
 * @return true when the file could be opened, input() accepted it and the
 *         table holds at least one entry afterwards; false otherwise.
 */
bool PinyinPhraseTable::load_index(const char* indexfile)
{
	std::ifstream index_stream(indexfile);
	if (!index_stream) {
		return false;
	}

	const bool parsed = input (index_stream);
	return parsed && !m_phrases.empty ();
}

/**
 * Flush cached changes to disk.
 *
 * First the frequencies held in m_recent_hit_cache are written in place into
 * the phrase data file, then the entries held in m_new_phrases_cache are
 * appended to the index file. Each cache is cleared only after its writes
 * have been flushed successfully, so a failed save does not discard the
 * pending changes.
 *
 * @param indexfile path of the index file to append to.
 * @return true when both files were updated; false if either could not be
 *         opened or written.
 */
bool PinyinPhraseTable::save_index(const char* indexfile)
{
	std::fstream fs(murphutalk_Phrase_file,std::ios::in|std::ios::out);
	if (!fs) return false;

	printX86("updating phrase dat file for frequencies ...\n");
	//update changed frequencies to phrase file
	for(PhraseOffsetToFreqMap::iterator pos=m_recent_hit_cache.begin();
	    pos!=m_recent_hit_cache.end();++pos){
		printX86("from hit cache -> offset %d,freq %d\n",pos->first,pos->second);
		//the frequency is the first field of the record at this offset
		fs.seekp(pos->first,std::ios::beg);
		fs<<pos->second;
	}
	//report a failed seek/write instead of silently dropping the updates
	fs.flush();
	if (!fs) return false;
	m_recent_hit_cache.clear();

	//append new phrases keys and offsets in phrase file to index 
	std::ofstream out(indexfile,std::ios::app|std::ios::out);
	if (!out) return false;

	printX86("updating phrase index ...\n");
	for(PinyinPhraseEntryVector::iterator pos=m_new_phrases_cache.begin();
	    pos!=m_new_phrases_cache.end();++pos){
		pos->output_text(out);
	}
	out.flush();
	if (!out) return false;
	m_new_phrases_cache.clear();
	
	return true;
}

#if 0
// Disabled code: everything up to the matching #endif is compiled out.
//rewrite the whole phrase index file,merge phrase entries with same key into the same line
// Writes the header line, the version line and the number of entries, then
// one output_text() line per entry in m_phrases. Always returns true.
bool PinyinPhraseTable::output (std::ostream &os)
{
	os<<murphytalk_Phrase_index_header<<std::endl;
	os<<murphytalk_Phrase_index_version<<std::endl;
	os<<m_phrases.size()<<std::endl;

	for(PinyinPhraseEntryVector::iterator pos=m_phrases.begin();pos!=m_phrases.end();pos++){
		pos->output_text(os);
	}
	return true;
}
#endif

/**
 * Parse a phrase index from a stream into m_phrases.
 *
 * The first line must start with murphytalk_Phrase_index_header and the
 * second with murphytalk_Phrase_index_version. The remainder is read as a
 * sequence of PinyinPhraseEntry records; records whose key string was already
 * seen are merged into the existing entry. Finally m_phrases is sorted with
 * PinyinPhraseKeyLessThan and the last-matched range is reset to end().
 *
 * @param is stream to read from.
 * @return false if the stream is unusable or either header line does not
 *         match; true otherwise.
 */
bool PinyinPhraseTable::input (std::istream &is)
{
	char header [40];

	if (!is) return false;
	
	is.getline (header, sizeof (header));
	if (strncmp (header,
		murphytalk_Phrase_index_header,
		strlen (murphytalk_Phrase_index_header)) != 0) {
			std::cout<<"invalidate Phrase table index file"<<std::endl;
			return false;
	}

	is.getline (header, sizeof (header));
	if (strncmp (header,
		murphytalk_Phrase_index_version,
		strlen (murphytalk_Phrase_index_version)) != 0) {
			std::cout<<"invalidate Phrase table index file version:"<<std::endl;
			return false;
	}

	//maps a key string to the position of its entry in m_phrases
	typedef std::map<String,unsigned int> ENTRY_MAP;
	ENTRY_MAP lookup;
	ENTRY_MAP::iterator lookup_pos;

#ifdef DEBUG
	uint32 i=0;
#endif
	while(!is.eof())
	{
		PinyinPhraseEntry phrase_entry(is);

		if(phrase_entry.isValid()){
			//is entry with same key avaliable?
			String keystr=phrase_entry.get_key().get_key_string();
			lookup_pos=lookup.find(keystr);
			if(lookup_pos!=lookup.end()){
				//got you!
				printX86("-> %s ...",keystr.c_str());
				m_phrases[lookup_pos->second].append(phrase_entry);
				printX86("OK\n");
			}
			else{
				m_phrases.push_back(phrase_entry);
				lookup[keystr]=m_phrases.size()-1;
			}
		}

#ifdef DEBUG
		i++;
#endif
		//stop on end of input or on any stream error
		if(!is||is.eof()){
#ifdef DEBUG
			if(!is){				
				//%u with an explicit cast: i is unsigned
				printf("No.%u,exception\n",(unsigned int)i);
			}
#endif
			break;
		}
	}

	std::sort(m_phrases.begin(),m_phrases.end(),PinyinPhraseKeyLessThan());
#ifdef DEBUG
	for(PinyinPhraseEntryVector::iterator pos=m_phrases.begin();pos!=m_phrases.end();pos++){
		printf("%s\n",pos->get_key().get_key_string().c_str());
	}
	//size() returns size_t; passing it to %d is a format mismatch
	printf("%u Phrases entries sorted\n",(unsigned int)m_phrases.size());
#endif	
	m_last_matched_phrases_range.first=m_last_matched_phrases_range.second=m_phrases.end();
	return true;
}

/**
 * Add an entry to the table and restore the key ordering.
 *
 * @param phrase entry to copy into m_phrases.
 */
void PinyinPhraseTable::insert(PinyinPhraseEntry& phrase)
{
	m_phrases.push_back(phrase);
	std::sort(m_phrases.begin(),m_phrases.end(),PinyinPhraseKeyLessThan());

	//push_back/sort invalidate or re-target any iterators kept from an
	//earlier lookup; reset the remembered range the same way input() does
	m_last_matched_phrases_range.first=m_last_matched_phrases_range.second=m_phrases.end();
}

/**
 * Collect the phrase offsets of every entry whose key matches a pinyin key.
 *
 * The matching range is found with std::equal_range using
 * PinyinPhraseKeyLessThan and is also remembered in
 * m_last_matched_phrases_range.
 *
 * @param phrases cleared first, then filled by get_all_phrases() of each
 *                matching entry.
 * @param pinyin  key to look up.
 * @return the number of elements in phrases afterwards.
 */
unsigned int PinyinPhraseTable::find_phrases(PhraseOffsetFrequencyPairVector& phrases,PinyinPhraseKey& pinyin)
{
	phrases.clear ();

	PinyinPhraseEntryVectorPosRangePair matched =
		std::equal_range(m_phrases.begin(), m_phrases.end(), pinyin,PinyinPhraseKeyLessThan());

	m_last_matched_phrases_range=matched;

	PinyinPhraseEntryVector::const_iterator cursor = matched.first;
	while (cursor != matched.second) {
		// get_all_phrases() is non-const, hence the cast
		PinyinPhraseEntry& hit=const_cast<PinyinPhraseEntry&>(*cursor);
#ifdef X86
		hit.get_key();
		String key_text=hit.get_key().get_key_string();
		printf("%s\n",key_text.c_str());
#endif	      
		hit.get_all_phrases(phrases);
		++cursor;
	}

	return phrases.size ();
}


/**
 * Resolve phrase offsets into frequencies and phrase texts.
 *
 * For every offset in phrases_pair the frequency stored at that offset in
 * the phrase data file is read (a value in m_recent_hit_cache takes
 * precedence). phrases_pair is then sorted with
 * PhraseOffsetFrequencyPairGreaterThanByFrequency and the UTF-8 text that
 * follows each frequency is decoded into strs, in that sorted order.
 *
 * @param phrases_pair in: offsets to resolve; out: the same offsets with
 *                     their frequencies filled in, sorted.
 * @param strs         cleared, then receives one phrase per element of
 *                     phrases_pair (an empty phrase when a record cannot be
 *                     read).
 * @return the number of phrases stored in strs; 0 if the phrase data file
 *         cannot be opened.
 */
unsigned int PinyinPhraseTable::get_phrases_by_offsets(PhraseOffsetFrequencyPairVector& phrases_pair,
						       PhraseStringVector& strs)
{
	std::ifstream in(murphutalk_Phrase_file);
	if(!in){
		return 0;
	}

	//read(from file or file hit cache) frequncies first
	for(PhraseOffsetFrequencyPairVector::iterator pos=phrases_pair.begin();
	    pos!=phrases_pair.end();++pos){
		uint32 freq=0;

		//a failed read at one offset must not poison the following ones
		in.clear();
		//move to the right position
		in.seekg(pos->first,std::ios::beg);
		printX86("moved to offset %d ... ",(unsigned int)in.tellg());
		//read the frequency
		in>>freq;
		if(!in){
			//unreadable record: fall back to a defined value
			freq=0;
		}
		printX86("freq in file is %d,file get position is %u\n",freq,(unsigned int)in.tellg());

		//have we inputed this phrase(or:hit this phrase) before?
		//check our recent hit cache
		PhraseOffsetToFreqMap::iterator pos_cache = m_recent_hit_cache.find(pos->first);
		if(pos_cache==m_recent_hit_cache.end()){
			//not cached,use freq read from file
			pos->second=freq;
		}
		else{
			//use cached freq
			pos->second=pos_cache->second;
			printX86("use cached freq = %d\n",pos_cache->second);
		}
	}

	//sort by frequency
	std::sort (phrases_pair.begin (), phrases_pair.end (), PhraseOffsetFrequencyPairGreaterThanByFrequency ());

	//now read phrase strings - in the right freqency order
	strs.clear();
	String       raw;
	PhraseString wide;
	for(PhraseOffsetFrequencyPairVector::iterator pos=phrases_pair.begin();
	    pos!=phrases_pair.end();++pos){
		in.clear();
		//move to the right position
		in.seekg(pos->first+sizeof(PhraseOffset),std::ios::beg);
		printX86("get phrase : moved to offset %d\n",(unsigned int)in.tellg());
		//read the phrase text,which is encoded in utf8
		//(drop the previous text first: a failed extraction may leave it untouched)
		raw.erase();
		in>>raw;
		//size the buffer for real: decoding yields at most one character
		//per input byte, and the extra element keeps &wide[0] valid even
		//for an empty text
		wide.assign(raw.size()+1,ucs4_t(0));
		ucs4_t* p=&wide[0];
		int len=utf8_mbstowcs(p,(const __u8 *)raw.c_str(),raw.size());
		strs.push_back(PhraseString(p,p+len));
	}
	return strs.size();
}

/**
 * Add a user-defined phrase for a pinyin key.
 *
 * If an entry equal to the one built from pinyin already exists, its phrases
 * are read back from the phrase data file and the call is rejected when str
 * is already among them. Otherwise a record (frequency 1 followed by the
 * UTF-8 text and a separating space) is appended to the phrase data file,
 * its offset is added to the in-memory table, and the new entry is queued in
 * m_new_phrases_cache.
 *
 * @param str    phrase text. NOTE: a space and a terminating zero are
 *               appended to it as a side effect.
 * @param pinyin pinyin spelling used as the key.
 * @return true when the phrase was appended; false when it is a duplicate or
 *         the phrase data file cannot be opened or positioned.
 */
bool PinyinPhraseTable::append_phrase(PhraseString& str,const char* pinyin)
{	
	PinyinPhraseEntry entry(pinyin);
	
	PinyinPhraseEntryVector::iterator pos=std::find(m_phrases.begin(),m_phrases.end(),entry);
	if(pos!=m_phrases.end()){
		//same entry
		//we have same pinyin here,have to make sure we don't have the exactly same phrase

		//read the matched phrases from file
		PhraseStringVector strs;
		PhraseOffsetFrequencyPairVector temp_phrases;
		for(PhraseOffsetVector::iterator i=pos->m_phrases.begin();i!=pos->m_phrases.end();++i){
			temp_phrases.push_back(std::make_pair(*i,0));
		}
				
		get_phrases_by_offsets(temp_phrases,strs);
		for(PhraseStringVector::iterator pos_str=strs.begin();pos_str!=strs.end();++pos_str){
			if(*pos_str==str){
				//oops,already got this phrase
				printX86("duplicated phrase\n");
				return false;
			}
		}
	}

	//append phrase to phrase lib file
	std::ofstream out(murphutalk_Phrase_file,std::ios::app|std::ios::out);
	if(!out){
		return false;
	}

	//ios::app only guarantees that writes land at the end; the position
	//reported right after opening may still be 0, so seek explicitly
	//before asking where the new record will start
	out.seekp(0,std::ios::end);
	const std::streamoff end_pos=out.tellp();
	if(!out||end_pos<0){
		return false;
	}
	PhraseOffset offset = static_cast<PhraseOffset>(end_pos);
	//save offset to phrase entry
	entry.insert(offset);

	printX86("new phrase %s,offset is %u\n",pinyin,offset);
	if(pos!=m_phrases.end()){
		pos->insert(offset);
	}
	else{
		insert(entry);
	}

	//put into new phrase cache
	m_new_phrases_cache.push_back(entry);

	//first of all output a frequency of 1
	uint32 freq=1;
	out.write((char*)&freq,sizeof(freq));

	//this space is used to seperate this phrase with the next one
	str.push_back(0x0020);
	//terminate the string
	str.push_back(0);
	//convert to utf8 and append to file
	utf8_write_phase_string(out,str);

	return true;
}

/**
 * Record a new frequency for the phrase stored at the given offset.
 *
 * The value is only kept in m_recent_hit_cache (inserted or overwritten).
 * The first preprocessor branch is disabled code.
 *
 * @param offset offset identifying the phrase.
 * @param freq   frequency to remember.
 */
void PinyinPhraseTable::set_frequency(uint32 offset,uint32 freq)
{
#if 0
	if(m_last_matched_phrases_range.first!=m_phrases.end()&&
	   m_last_matched_phrases_range.second!=m_phrases.end()){
		for (PinyinPhraseEntryVector::const_iterator i = m_last_matched_phrases_range.first; 
		     i!= m_last_matched_phrases_range.second; i++) {
			PinyinPhraseEntry& entry=const_cast<PinyinPhraseEntry&>(*i);
			if(entry.set_frequency(offset,freq))  break;
		}
	}
#else
	printX86("PinyinPhraseTable::set_frequency(%d,%d)\n",offset,freq);

	// Overwrite the cached value if there is one ...
	PhraseOffsetToFreqMap::iterator cached = m_recent_hit_cache.find(offset);
	if(cached!=m_recent_hit_cache.end()){
		printX86("found in hit cache\n");
		cached->second=freq;
		return;
	}

	// ... otherwise create the cache entry.
	m_recent_hit_cache[offset]=freq;
#endif
}

/** Construct an entry by reading it from a stream with input_text(). */
PinyinPhraseEntry::PinyinPhraseEntry(std::istream &is)
{
	input_text(is);
}

/** Construct an entry with no offsets whose key is set from a pinyin string. */
PinyinPhraseEntry::PinyinPhraseEntry(const char*pinyin)
{
	m_key.set_key(pinyin);
}

/** Construct an entry with no offsets whose key is a copy of the given key. */
PinyinPhraseEntry::PinyinPhraseEntry(PinyinPhraseKey& key)
	: m_key(key)
{
}

/** Destructor; performs no explicit work. */
PinyinPhraseEntry::~PinyinPhraseEntry()
{
}

#if 0
// Disabled code: everything up to the matching #endif is compiled out.
// Searches m_phrases for the given offset, stores freq on the match and
// returns true; returns false (with a message under X86) when the offset is
// not found.
// NOTE(review): the loop compares *pos to an offset but assigns pos->second,
// so it presumably predates the current element type of m_phrases - confirm
// before re-enabling.
bool PinyinPhraseEntry::set_frequency(uint32 offset,uint32 freq)
{
	for(PhraseOffsetVector::iterator pos=m_phrases.begin();pos!=m_phrases.end();pos++){
		if(*pos==offset){
			pos->second=freq;
			return true;
		}
	}
#ifdef X86
	printf("offset %d -> not found this offset in last matched entry\n",offset);
#endif
	return false;
}
#endif

/**
 * Read one entry in text form: a key string, an offset count, then that many
 * offsets, all whitespace separated.
 *
 * The key is always set from the token that was read. The count and offsets
 * are consumed only when the resulting key is valid. Reading stops at the
 * first value that cannot be extracted, so a truncated or malformed record
 * never contributes indeterminate offsets.
 *
 * @param is stream to read from.
 * @return is.
 */
std::istream& PinyinPhraseEntry::input_text (std::istream &is)
{
	String keystr;

	is>>keystr;
	
	m_key.set_key(keystr.c_str());

	if(m_key.isValid()){
		int count=0;
		if(is>>count){
			for(int i=0;i<count;i++){
				PhraseOffset offset;
				if(!(is>>offset)){
					//malformed or truncated record: stop here
					break;
				}
				insert(offset);
			}
		}
	}

	return is;
}

std::ostream& PinyinPhraseEntry::output_text(std::ostream &os)
{
	for(PinyinKeyVector::iterator pos=m_key.m_keys.begin();pos!=m_key.m_keys.end();pos++){
		os<<pos->get_key_string();
	}
	os<<"\t"<<m_phrases.size()<<"\t";
	for(PhraseOffsetVector::iterator pos=m_phrases.begin();pos!=m_phrases.end();pos++){
		os<<*pos<<" ";
	}
	os<<std::endl;
	return os;
}

/**
 * Append every offset of this entry to vect, each paired with a frequency
 * of 0.
 *
 * @param vect vector that receives the (offset, 0) pairs.
 * @return always 0.
 */
unsigned int PinyinPhraseEntry::get_all_phrases(PhraseOffsetFrequencyPairVector& vect)
{
	PhraseOffsetVector::iterator it=m_phrases.begin();
	while(it!=m_phrases.end()){
		vect.push_back(std::make_pair(*it,0));
		++it;
	}
	return 0;
}

/** Construct a key without parsing anything. */
PinyinPhraseKey::PinyinPhraseKey()
{
}

/** Construct a key by passing a pinyin string to set_key(). */
PinyinPhraseKey::PinyinPhraseKey(const char *keystr)
{
	set_key(keystr);
}

/** Destructor; performs no explicit work. */
PinyinPhraseKey::~PinyinPhraseKey()
{
}

/**
 * Parse a pinyin string into m_keys.
 *
 * Delegates to PinyinKey::parse_pinyin_key with
 * scim_default_pinyin_validator.
 *
 * @param keystr pinyin string to parse.
 */
void PinyinPhraseKey::set_key(const char *keystr)
{
	PinyinKey::parse_pinyin_key(scim_default_pinyin_validator,
	                            m_keys,
	                            keystr);
}

/**
 * Build a printable form of the key.
 *
 * @return the key string of every element of m_keys in order, each followed
 *         by a single space; an empty string when there are no keys.
 */
String PinyinPhraseKey::get_key_string()
{
	String joined;
	for(PinyinKeyVector::iterator it=m_keys.begin();it!=m_keys.end();++it){
		joined+=it->get_key_string();
		joined+=" ";
	}
	return joined;
}

/*
 *  Revision history
 * 
 *  $Log: PinyinPhrase.cpp,v $
 *  Revision 1.4  2004/07/31 17:23:36  Lu Mu
 *  1)cache changes to index and phrase file(frequency and new phrase)
 *  2)support write to phrase index and phrase file format V0.03
 *
 *  Revision 1.3  2004/07/26 18:19:23  Lu Mu
 *  support the reading of phrase file format V0.03
 *
 *  Revision 1.2  2004/07/20 11:25:29  Lu Mu
 *  (1)phrase frequency
 *  (2)self define phrase
 *
 *  Revision 1.1  2004/07/17 07:10:30  Lu Mu
 *  phrase support
 *
 *
 */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -