
📄 dataset.py.svn-base

📁 The decoder is the core module of a phrase-based statistical machine translation system.
#!/usr/bin/python2.3

'''Decoder interface:

Dataset.process() expects a function, which in turn takes a Sentence as input
and produces a Sentence or list of Sentences as output.

The input Sentence will be marked with the <seg> tag it was found in
the input file with.

The output Sentences should be marked with <seg> tags if they are to
be marked as such in the output file.
'''

import sys, sgmllib, xml.sax.saxutils, log

def attrs_to_str(d):
    if len(d) == 0:
        return ""
    l = [""] + ["%s=%s" % (name, xml.sax.saxutils.quoteattr(value)) for (name, value) in d]
    return " ".join(l)

def attrs_to_dict(a):
    d = {}
    for (name, value) in a:
        if d.has_key(name.lower()):
            raise ValueError, "duplicate attribute names"
        d[name.lower()] = value
    return d

def strip_newlines(s):
    return " ".join(s.split())

class Sentence(object):
    def __init__(self, words=None, meta=None):
        if words is not None:
            self.words = list(words)
        else:
            self.words = []
        if meta is not None:
            self.meta = meta
        else:
            self.meta = []

    def mark(self, tag, attrs):
        self.meta.append((tag, attrs, 0, len(self.words)))

    def getmark(self):
        if len(self.meta) > 0:
            (tag, attrs, i, j) = self.meta[-1]
            if i == 0 and j == len(self.words):
                return (tag, attrs)
            else:
                return None
        else:
            return None

    def unmark(self):
        mark = self.getmark()
        if mark is not None:
            self.meta = self.meta[:-1]
        return mark

    def __cmp__(self, other):
        return cmp((self.words, self.meta), (other.words, other.meta))

    def __str__(self):
        def cmp_spans((tag1, attr1, i1, j1), (tag2, attr2, i2, j2)):
            if i1 == i2 <= j1 == j2:
                return 0
            elif i2 <= i1 <= j1 <= j2:
                return -1
            elif i1 <= i2 <= j2 <= j1:
                return 1
            else:
                return cmp((i1, j1), (i2, j2)) # don't care

        # this guarantees that equal spans will come out nested
        # we want the later spans to be outer
        # this relies on stable sort
        open = [[] for i in xrange(len(self.words)+1)]
        # there seems to be a bug still with empty spans
        empty = [[] for i in xrange(len(self.words)+1)]
        close = [[] for j in xrange(len(self.words)+1)]
        for (tag, attrs, i, j) in sorted(self.meta, cmp=cmp_spans):
            if i == j:
                # do we want these to nest?
                empty[i].append("<%s%s/>" % (tag, attrs_to_str(attrs)))
            open[i].append("<%s%s>" % (tag, attrs_to_str(attrs)))
            close[j].append("</%s>" % tag)
        result = []
        if len(empty[0]) > 0:
            result.extend(empty[0])
        for i in xrange(len(self.words)):
            if i > 0:
                result.append(" ")
            result.extend(reversed(open[i]))
            result.append(self.words[i])
            result.extend(close[i+1])
            if len(empty[i+1]) > 0:
                result.extend(empty[i+1])
        return "".join(result)

    def __add__(self, other):
        if type(other) in (list, tuple):
            return Sentence(self.words + list(other), self.meta)
        else:
            othermeta = [(tag, attrs, i+len(self.words), j+len(self.words)) for (tag, attrs, i, j) in other.meta]
            return Sentence(self.words + other.words, self.meta + othermeta)

def read_raw(f):
    """Read a raw file into a list of Sentences."""
    if type(f) is str:
        f = file(f, "r")
    inputs = []
    i = 0
    for line in f:
        sent = process_sgml_line(line, i)
        sent.mark('seg', [('id', str(i))])
        inputs.append(sent)
        i += 1
    return inputs

class Dataset(object):
    def __init__(self, id=None):
        self.id = id
        self.docs = {}
        self.sysids = []
        self.langs = {}

    def read(self, f):
        '''Read a file into the dataset. Returns (root, sysids)'''
        if type(f) is str:
            f = file(f, "r")
        p = DatasetParser(self)
        p.feed(f.read())
        p.close()
        return (p.root, p.sysids)

    def read_raw(self, f, docid, setid=None, sysid=None, lang=None):
        """Read a raw file into the dataset."""
        if setid is not None:
            if self.id is not None and self.id != setid:
                raise ValueError, "Set ID does not match"
            else:
                self.id = setid
        if sysid not in self.sysids:
            self.sysids.append(sysid)
            self.langs[sysid] = lang
        if type(f) is str:
            f = file(f, "r")
        doc = self.docs.setdefault(docid, Document(docid))
        i = 0
        for line in f:
            if len(doc.segs)-1 < i:
                doc.segs.append(Segment(i))
            if doc.segs[i].versions.has_key(sysid):
                raise ValueError, "multiple versions from same system"
            doc.segs[i].versions[sysid] = process_sgml_line(line, i)
            doc.segs[i].versions[sysid].mark('seg', [('id', str(i))])
            i += 1
        return (None, [sysid])

    def write(self, f, tag, sysids=None):
        if type(f) is str:
            f = file(f, "w")
        f.write(self.string(tag, sysids))

    def write_raw(self, f, sysid=None):
        if type(f) is str:
            f = file(f, "w")
        for seg in self.segs():
            f.write(" ".join(seg.versions[sysid].words))
            f.write("\n")

    def string(self, tag, sysids=None):
        if sysids is None:
            sysids = self.sysids
        elif type(sysids) is str:
            sysids = [sysids]
        attrs = [('setid', self.id)]
        if self.langs.has_key(None):
            attrs.append(('srclang', self.langs[None]))
        trglangs = [self.langs[sysid] for sysid in sysids if sysid is not None]
        for lang in trglangs[1:]:
            if lang != trglangs[0]:
                raise ValueError, "Inconsistent target language"
        if len(trglangs) >= 1:
            attrs.append(('trglang', trglangs[0]))
        return "<%s%s>\n%s</%s>\n" % (tag,
                                      attrs_to_str(attrs),
                                      "".join([doc.string(sysid) for doc in self.docs.values() for sysid in sysids]),
                                      tag)

    def process(self, processor, sysid, lang, srcsysid=None):
        if sysid in self.sysids:
            raise ValueError, "sysid already in use"
        else:
            self.sysids.append(sysid)
            self.langs[sysid] = lang
        for seg in self.segs():
            if log.level >= 2:
                sys.stderr.write("Input: %s\n" % str(seg.versions[srcsysid]))
            seg.versions[sysid] = processor(seg.versions[srcsysid])
            if log.level >= 2:
                if type(seg.versions[sysid]) is not list:
                    sys.stderr.write("Output: %s\n" % str(seg.versions[sysid]))
                else:
                    sys.stderr.write("Output (1st): %s\n" % str(seg.versions[sysid][0]))

    def segs(self):
        for doc in self.docs.values():
            for seg in doc.segs:
                yield seg

class Document(object):
    def __init__(self, id):
        self.id = id
        self.segs = []

    def string(self, sysid):
        attrs = [('docid', self.id)]
        if sysid is not None:
            attrs.append(('sysid', sysid))
        return "<doc%s>\n%s</doc>\n" % (attrs_to_str(attrs),
                                        "".join([seg.string(sysid) for seg in self.segs]))

class Segment(object):
    def __init__(self, id=None):
        self.id = id
        self.versions = {}

    def string(self, sysid):
        v = self.versions[sysid]
        if type(v) is not list:
            v = [v]
        output = []
        for i in xrange(len(v)):
            output.append(str(v[i]))
            output.append('\n')
        return "".join(output)

def process_sgml_line(line, id=None):
    p = DatasetParser(None)
    p.pos = 0
    p.words = []
    p.meta = []
    p.feed(line)
    p.close()
    sent = Sentence(p.words, p.meta)
    return sent

class DatasetParser(sgmllib.SGMLParser):
    def __init__(self, set):
        sgmllib.SGMLParser.__init__(self)
        self.words = None
        self.sysids = []
        self.set = set
        self.mystack = []

    def handle_starttag(self, tag, method, attrs):
        thing = method(attrs)
        self.mystack.append(thing)

    def handle_endtag(self, tag, method):
        thing = self.mystack.pop()
        method(thing)

    def unknown_starttag(self, tag, attrs):
        thing = self.start(tag, attrs)
        self.mystack.append(thing)

    def unknown_endtag(self, tag):
        thing = self.mystack.pop()
        self.end(tag, thing)

    def start_srcset(self, attrs):
        attrs = attrs_to_dict(attrs)
        if self.set.id is None:
            self.set.id = attrs['setid']
        if 0 and self.set.id != attrs['setid']:
            raise ValueError, "Set ID does not match"
        self.lang = attrs['srclang']
        self.root = 'srcset'
        return None

    def start_refset(self, attrs):
        attrs = attrs_to_dict(attrs)
        if self.set.id is None:
            self.set.id = attrs['setid']
        if 0 and self.set.id != attrs['setid']:
            raise ValueError, "Set ID does not match"
        if self.set.langs.setdefault(None, attrs['srclang']) != attrs['srclang']:
            raise ValueError, "Source language does not match"
        self.lang = attrs['trglang']
        self.root = 'refset'
        return None

    def start_tstset(self, attrs):
        attrs = attrs_to_dict(attrs)
        if self.set.id is None:
            self.set.id = attrs['setid']
        if 0 and self.set.id != attrs['setid']:
            raise ValueError, "Set ID does not match"
        if 0 and self.set.langs.setdefault(None, attrs['srclang']) != attrs['srclang']:
            raise ValueError, "Source language does not match"
        self.lang = attrs['trglang']
        self.root = 'tstset'
        return None

    def end_srcset(self, thing):
        for sysid in self.sysids:
            if sysid not in self.set.sysids:
                self.set.sysids.append(sysid)
                self.set.langs[sysid] = self.lang
    end_refset = end_tstset = end_srcset

    def start_doc(self, attrs):
        attrs = attrs_to_dict(attrs)
        self.doc = self.set.docs.setdefault(attrs['docid'], Document(attrs['docid']))
        self.seg_i = 0
        if self.root == 'srcset':
            self.sysid = None
        else:
            self.sysid = attrs['sysid']
        if self.sysid not in self.sysids:
            self.sysids.append(self.sysid)
        return None

    def end_doc(self, thing):
        pass

    def start_seg(self, attrs):
        thing = ('seg', attrs, 0, None)
        attrs = attrs_to_dict(attrs)
        if len(self.doc.segs)-1 < self.seg_i:
            self.doc.segs.append(Segment(attrs.get('id', None)))
        self.seg = self.doc.segs[self.seg_i]
        if 0 and self.seg.id is not None and attrs.has_key('id') and self.seg.id != attrs['id']:
            raise ValueError, "segment ids do not match (%s != %s)" % (str(self.seg.id), str(attrs.get('id', None)))
        if self.seg.versions.has_key(self.sysid):
            raise ValueError, "multiple versions from same system"
        self.pos = 0
        self.words = []
        self.meta = []
        return thing

    def end_seg(self, thing):
        (tag, attrs, i, j) = thing
        self.meta.append((tag, attrs, i, self.pos))
        self.seg_i += 1
        self.seg.versions[self.sysid] = Sentence(self.words, self.meta)
        self.words = None

    """# Special case for start and end of sentence
    def start_s(self, attrs):
        if self.words is not None:
            self.pos += 1
            self.words.append('<s>')
        return None

    def end_s(self, thing):
        if self.words is not None:
            self.pos += 1
            self.words.append('</s>')"""

    def start(self, tag, attrs):
        if self.words is not None:
            return (tag, attrs, self.pos, None)
        else:
            return None

    def end(self, tag, thing):
        if self.words is not None:
            (tag, attrs, i, j) = thing
            self.meta.append((tag, attrs, i, self.pos))

    def handle_data(self, s):
        if self.words is not None:
            words = s.split()
            self.pos += len(words)
            self.words.extend(words)

if __name__ == "__main__":
    s = Dataset()
    for filename in sys.argv[1:]:
        s.read_raw(filename, 'whatever', 'whatever', filename, 'English')
    s.write(sys.stdout, 'tstset')
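
As the module docstring states, Dataset.process() accepts any callable that maps a Sentence to a Sentence (or a list of Sentences). The sketch below is a hypothetical usage example, not part of the original file: it assumes Python 2 (the module relies on sgmllib and old-style raise statements), that this file is importable as dataset, that the sibling log module it imports is on the path, and that a file named source.txt holds one tokenized source sentence per line; toy_decoder is a stand-in for a real phrase-based decoder.

# usage_sketch.py -- hypothetical example; names like "source.txt", "doc1",
# "demo" and "toy" are placeholders, not anything defined by the module.
import sys
import dataset

def toy_decoder(sent):
    # Stand-in for a real decoder: Sentence in, Sentence out.
    # The words are transformed (here just lowercased) and the span metadata
    # is copied so the output keeps the <seg id=...> mark read_raw() attached.
    return dataset.Sentence([w.lower() for w in sent.words], list(sent.meta))

if __name__ == "__main__":
    d = dataset.Dataset()
    # Load the raw source text under sysid None (the source side), the same
    # way the module's own __main__ block calls read_raw().
    d.read_raw("source.txt", "doc1", setid="demo", sysid=None, lang="Chinese")
    # Decode every segment of the source (srcsysid=None) into a new system id.
    d.process(toy_decoder, "toy", "English", srcsysid=None)
    # Emit the source as a <srcset> and the decoder output as a <tstset>.
    d.write(sys.stdout, "srcset", sysids=[None])
    d.write(sys.stdout, "tstset", sysids=["toy"])

Because toy_decoder copies sent.meta, each output Sentence keeps its <seg> span, so the <tstset> comes out with segment tags exactly as the docstring describes.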
