📄 pttrain.py

📁 A Python implementation of a Chinese word segmenter. Only the program itself is included, so you must provide the training data yourself.
💻 PY
字号:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Averaged-perceptron Chinese word segmenter: training and decoding.

Every character is tagged "b" (begins a word), "m" (middle), "e" (ends a
word) or "s" (single-character word).  Training writes the averaged feature
weights to PTData/avgmodel; segmenting loads them back.

The module runs unchanged on Python 2 and Python 3.
"""
import gc
import io
import os
import random
import sys
import time

try:
    import cPickle
except ImportError:  # Python 3: the accelerated pickler lives in ``pickle``
    import pickle as cPickle

# time.clock() was removed in Python 3.8; use perf_counter() where it exists.
_clock = getattr(time, "perf_counter", None) or time.clock

_DATA_DIR = "PTData"
_MODEL_PATH = _DATA_DIR + "/avgmodel"
_ENCODING = "cp936"
# The whitespace set that byte-string strip() removes.  Stripping only these
# keeps full-width characters such as U+3000 usable as dictionary entries.
_ASCII_WS = " \t\n\r\x0b\x0c"
_TAGS = ("b", "m", "e", "s")
_MAX_WORD_LEN = 10      # longest word (in characters) the decoder considers
_OTHER_TYPE = 4         # character type for anything not in the type files


def _to_text(data):
    """Return *data* as unicode text, decoding cp936 byte strings."""
    if isinstance(data, bytes):
        return data.decode(_ENCODING)
    return data


def _padding():
    """Return the two fresh sentinel entries that pad each end of an instance."""
    return [["<s>", "s", -1], ["<s>", "s", -1]]


def _show(text):
    """Print unicode *text*; on Python 2 emit it as UTF-8 bytes."""
    if str is bytes:    # Python 2: stdout expects byte strings
        text = text.encode("utf-8")
    print(text)


class CPTTrain:
    """Perceptron segmenter that is created either for training or for segmenting.

    Exactly one of *segment* / *train* must be true.  In training mode the
    instance exposes ``Train(corp_file_name, max_train_num, max_ite_num)``;
    in segmenting mode it loads PTData/avgmodel and exposes ``Segment(src)``.
    """

    def __init__(self, segment, train):
        """Validate the mode, load the character-type tables and bind the API.

        The files PTData/punc, PTData/alph, PTData/date and PTData/num (cp936,
        one entry per line) map their entries to the types 0, 1, 2 and 3.

        Raises:
            SystemExit: if the mode flags are inconsistent or a type file is
                missing (a message is printed first).
        """
        if bool(segment) == bool(train):
            print("there is only a True and False in segment and train")
            sys.exit(1)

        self.__char_type = {}
        for ind, name in enumerate(["punc", "alph", "date", "num"]):
            fn = _DATA_DIR + "/" + name
            if not os.path.isfile(fn):
                print("can't open %s" % fn)
                sys.exit(1)
            with io.open(fn, "r", encoding=_ENCODING) as type_file:
                for line in type_file:
                    self.__char_type[line.strip(_ASCII_WS)] = ind

        self.__train_insts = None       # all training instances
        self.__feats_weight = None      # "<feature>=><tag>" --> weight
        self.__words_num = None         # tagged tokens over all instances
        self.__insts_num = None         # number of instances (sentences)
        self.__cur_ite_ID = None        # index of the current iteration
        self.__cur_inst_ID = None       # position in the shuffled order
        self.__real_inst_ID = None      # index into __train_insts after shuffling
        self.__last_update = None       # key --> [iteration, position] of last update
        self.__feats_weight_sum = None  # key --> weight accumulated over all steps

        if train:
            self.Train = self.__Train
        else:
            self.__LoadModel()
            self.Segment = self.__Segment

    def __LoadModel(self):
        """Load the averaged weights from PTData/avgmodel.

        A missing model file is reported and leaves the weights empty.
        """
        model = _MODEL_PATH
        print("load %s ..." % model)
        self.__feats_weight = {}
        if not os.path.isfile(model):
            print("can't open %s" % model)
            return
        start = _clock()
        # NOTE: unpickling executes arbitrary code; only load model files
        # that this program produced itself.
        with open(model, "rb") as model_file:
            self.__feats_weight = cPickle.load(model_file)
        print("It takes %d seconds" % (_clock() - start))

    def __Train(self, corp_file_name, max_train_num, max_ite_num):
        """Train on a corpus and save the averaged model.

        Args:
            corp_file_name: cp936 corpus, one sentence per line, made of
                whitespace-separated ``char/tag`` tokens.
            max_train_num: use at most this many lines; <= 0 means all.
            max_ite_num: maximum number of passes over the corpus.

        Returns:
            True when a model was written, False when the corpus cannot be
            read, holds no tokens, or no iteration was requested.
        """
        if max_ite_num <= 0:
            # Without a single pass there is nothing to average or save.
            print("max_ite_num must be positive")
            return False
        if not self.__LoadCorp(corp_file_name, max_train_num):
            return False
        if not self.__insts_num or not self.__words_num:
            # The accuracy figures divide by both counts.
            print("no training data in %s" % corp_file_name)
            return False

        start = _clock()
        self.__feats_weight = {}
        self.__last_update = {}
        self.__feats_weight_sum = {}
        for ite in range(max_ite_num):
            self.__cur_ite_ID = ite
            if self.__Iterate():    # a pass without errors: converged
                break
        self.__SaveModel()
        print("total iteration times is %d seconds" % (_clock() - start))
        return True

    def __GenerateFeats(self, inst):
        """Return one feature list per entry of *inst*.

        *inst* is a list of ``[char, tag, type]`` padded with two sentinel
        entries (type -1) at each end; sentinels get an empty list.  Every
        other position gets 12 features: the characters at offsets -2..2,
        the four adjacent pairs, the pair (-1, +1), a punctuation flag for
        the character itself and the types at offsets -2..2.
        """
        inst_feat = []
        for ind, (c, _tag, t) in enumerate(inst):
            cur = []
            inst_feat.append(cur)
            if t == -1:
                continue
            # Cn
            for n in range(-2, 3):
                cur.append("C%d==%s" % (n, inst[ind + n][0]))
            # CnCn+1
            for n in range(-2, 2):
                cur.append("C%dC%d==%s%s" % (n, n + 1, inst[ind + n][0], inst[ind + n + 1][0]))
            # C-1C1
            cur.append("C-1C1==%s%s" % (inst[ind - 1][0], inst[ind + 1][0]))
            # Pu(C0): type 0 comes from the "punc" file
            cur.append("Pu(%s)==%d" % (c, int(t == 0)))
            # T(C-2)T(C-1)T(C0)T(C1)T(C2)
            cur.append("T-2...2=%d%d%d%d%d" % tuple(inst[ind + n][2] for n in range(-2, 3)))
        return inst_feat

    def __SaveModel(self):
        """Finish the weight sums, average them and pickle them to PTData/avgmodel."""
        insts_num = self.__insts_num
        norm = float(self.__cur_ite_ID + 1) * insts_num
        now = self.__cur_ite_ID * insts_num + self.__cur_inst_ID
        for feat in self.__feats_weight_sum:
            last_ite_ID, last_inst_ID = self.__last_update[feat]
            # Steps during which the current weight has not been accumulated yet.
            c = now - (last_ite_ID * insts_num + last_inst_ID)
            self.__feats_weight_sum[feat] += self.__feats_weight[feat] * c
            self.__feats_weight_sum[feat] = self.__feats_weight_sum[feat] / norm
        # Close the file explicitly so the model is fully flushed to disk.
        with open(_MODEL_PATH, "wb") as model_file:
            cPickle.dump(self.__feats_weight_sum, model_file)
        self.__train_insts = None   # release the training data

    def __LoadCorp(self, corp_file_name, max_train_num):
        """Read up to *max_train_num* lines (all when <= 0) of the corpus.

        Returns False, after printing a message, when the file does not exist.
        """
        if not os.path.isfile(corp_file_name):
            print("can't open %s" % corp_file_name)
            return False

        self.__train_insts = []
        self.__words_num = 0
        with io.open(corp_file_name, "r", encoding=_ENCODING) as corpus:
            for ind, line in enumerate(corpus):
                if max_train_num > 0 and ind >= max_train_num:
                    break
                inst = self.__PreProcess(line)
                self.__train_insts.append(inst)
                self.__words_num += len(inst) - 4   # minus the four sentinels
        self.__insts_num = len(self.__train_insts)
        print("number of total insts is %d" % self.__insts_num)
        print("number of total characters is %d" % self.__words_num)
        print("")
        return True

    def __PreProcess(self, sent):
        """Turn a line of ``char/tag`` tokens into a padded training instance.

        *sent* may be a cp936 byte string or unicode text.  Returns a list of
        ``[char, tag, type]`` with two sentinel entries at each end.
        """
        inst = _padding()
        for word in _to_text(sent).split():
            # NOTE(review): a token without "/" yields an empty character and
            # the whole token as its tag - confirm the corpus never has one.
            c, _sep, tag = word.rpartition("/")
            inst.append([c, tag, self.__char_type.get(c, _OTHER_TYPE)])
        inst.extend(_padding())
        return inst

    def __Segment(self, src):
        """Segment one sentence.

        Args:
            src: the sentence as a cp936 byte string (unicode text also works).

        Returns:
            The words joined by single spaces, encoded as cp936 bytes.
        """
        inst = _padding()
        for c in _to_text(src):
            inst.append([c, "", self.__char_type.get(c, _OTHER_TYPE)])
        inst.extend(_padding())

        feats = self.__GenerateFeats(inst)
        tags = self.__DPSegment(inst, feats)

        rst = []
        for i in range(2, len(tags) - 2):
            if tags[i] in ("s", "b"):
                rst.append(inst[i][0])      # a new word starts here
            else:
                rst[-1] += inst[i][0]       # "m"/"e" continue the open word
        return " ".join(rst).encode(_ENCODING)

    def __Iterate(self):
        """Run one pass over the shuffled corpus.

        Prints sentence and token accuracy; returns True when the pass made
        no mistakes at all.
        """
        start = _clock()
        print("%d th iteration" % self.__cur_ite_ID)
        train_list = random.sample(range(self.__insts_num), self.__insts_num)
        error_sents_num = 0
        error_words_num = 0
        for cur_inst_ID, real_inst_ID in enumerate(train_list):
            self.__cur_inst_ID = cur_inst_ID
            self.__real_inst_ID = real_inst_ID
            num = self.__TrainInstance()
            if num > 0:
                error_sents_num += 1
            error_words_num += num

        st = 1 - float(error_sents_num) / self.__insts_num
        wt = 1 - float(error_words_num) / self.__words_num
        end = _clock()
        print("sents accuracy = %f%%, words accuracy = %f%%, it takes %d seconds" % (st * 100, wt * 100, end - start))
        print("")
        return error_sents_num == 0 and error_words_num == 0

    def __TrainInstance(self):
        """Decode the current instance, update the weights, return the error count."""
        cur_inst = self.__train_insts[self.__real_inst_ID]
        feats = self.__GenerateFeats(cur_inst)
        seg = self.__DPSegment(cur_inst, feats)
        return self.__Correct(seg, feats)

    def __DPSegment(self, inst, feats):
        """Find the best-scoring tag sequence by dynamic programming.

        Words are limited to 10 characters.  Returns a list as long as
        *inst*; the four sentinel positions stay None.
        """
        num = len(inst)
        stop = num - 2      # index of the first trailing sentinel

        # Score of every tag at every real position.
        value = [{} for _ in range(num)]
        for i in range(2, stop):
            for t in _TAGS:
                value[i][t] = self.__GetScore(i, t, feats)

        # best[i]: score of the best segmentation of [i, stop);
        # length[i]: length of its first word.  best[stop] == -1 is a
        # constant offset shared by every path, so it never changes the winner.
        tags = [None] * num     # also used as scratch space during the search
        best = [-1] * num
        length = [None] * num
        for i in range(stop - 1, 1, -1):
            for dis in range(1, _MAX_WORD_LEN + 1):
                if i + dis > stop:
                    break
                cur_score = best[i + dis]
                self.__Tag(i, i + dis, tags)
                for k in range(i, i + dis):
                    cur_score += value[k][tags[k]]
                # Strict ">" keeps the shortest word on ties.
                if length[i] is None or cur_score > best[i]:
                    best[i] = cur_score
                    length[i] = dis

        # Follow the stored word lengths to write the final tags.
        i = 2
        while i < stop:
            self.__Tag(i, i + length[i], tags)
            i += length[i]
        return tags

    def __GetScore(self, pos, t, feats):
        """Return the summed weight of tag *t* over the features at *pos*."""
        score = 0.0
        for feat in feats[pos]:
            score += self.__feats_weight.get(feat + "=>" + t, 0)
        return score

    def __Tag(self, f, t, tags):
        """Tag the half-open range [f, t) of *tags* as a single word."""
        if t - f == 1:
            tags[f] = "s"
        elif t - f >= 2:
            tags[f], tags[t - 1] = "b", "e"
            for i in range(f + 1, t - 1):
                tags[i] = "m"

    def __Correct(self, tags, feats):
        """Apply the perceptron update for every wrongly tagged position.

        Each feature of a wrong position gains 1 for the gold tag and loses 1
        for the predicted tag.  Returns the number of wrong positions.
        """
        cur_inst = self.__train_insts[self.__real_inst_ID]
        updates = {}
        error_words_num = 0
        for i in range(2, len(cur_inst) - 2):
            target = cur_inst[i][1]
            mine = tags[i]
            if mine == target:
                continue
            error_words_num += 1
            for feat in feats[i]:
                good = feat + "=>" + target
                bad = feat + "=>" + mine
                updates[good] = updates.get(good, 0.0) + 1
                updates[bad] = updates.get(bad, 0.0) - 1
        self.__Update(updates)
        return error_words_num

    def __Update(self, updates):
        """Add *updates* to the weights, keeping the running sums up to date.

        The sums are maintained lazily: before a weight changes, its old
        value is credited for every step since it was last touched.
        """
        insts_num = self.__insts_num
        now = [self.__cur_ite_ID, self.__cur_inst_ID]
        for feat in updates:
            last_ite_ID, last_inst_ID = self.__last_update.get(feat, [0, 0])
            c = (now[0] - last_ite_ID) * insts_num + now[1] - last_inst_ID
            old = self.__feats_weight.get(feat, 0)
            self.__feats_weight_sum[feat] = self.__feats_weight_sum.get(feat, 0) + c * old
            self.__feats_weight[feat] = old + updates[feat]
            self.__last_update[feat] = list(now)


def main():
    """Train on msr_train.txt, then segment every line of test.in (UTF-8)."""
    train = CPTTrain(train=True, segment=False)
    train.Train("msr_train.txt", max_train_num=1000, max_ite_num=20)
    # The instance stores its own bound method (self.Train), which forms a
    # reference cycle, so ``del`` alone does not free it.
    del train
    gc.collect()

    with io.open("test.in", "r", encoding="utf-8") as test_file:
        srcs = test_file.readlines()
    print("avg")
    seg = CPTTrain(train=False, segment=True)
    for src in srcs:
        # Drop the line terminator so it is not segmented as a character.
        src = src.rstrip("\n")
        _show(src)
        _show(seg.Segment(src.encode(_ENCODING)).decode(_ENCODING))
        print("")
    del seg


if __name__ == "__main__":
    main()
💿 文件大小 4 K
👤 上传用户 koalalee
📂 所属分类 *行业应用
📄 代码行数 299 行
💻 语言类型 Python
🏷️ 相关标签

#segmentor #the #complete #Chinese
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -