crfs.py

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 120 行

PY
120
字号
"""Jython helpers for working with MALLET CRFs.

The functions in this module wrap the MALLET Java API (CRF4, InstanceList,
SerialPipes, LineGroupIterator) so that a CRF can be set up from a Jython
script: building a pipe, loading line-group data files, creating and
initializing a CRF, saving/loading a model and printing sanity-check
information.
"""

from edu.umass.cs.mallet.base.types import InstanceList
from edu.umass.cs.mallet.base.fst import CRF4
from edu.umass.cs.mallet.base.pipe.iterator import LineGroupIterator
from edu.umass.cs.mallet.base.pipe import SimpleTaggerSentence2TokenSequence
from edu.umass.cs.mallet.base.pipe import SerialPipes, Pipe
from java.util.regex import Pattern
from java.io import FileReader, File, ObjectOutputStream, FileOutputStream
from java.io import FileInputStream, ObjectInputStream
from java.lang import Double
import jarray


def List2Pipe(pipeSequence, defaultLabel):
    """Make a SerialPipes object from a Python sequence of pipes.

    The first element of pipeSequence should pipe instances with data of the
    type given by the InstanceList that will be added and produce a
    TokenSequence.  Intermediate pipes should take and produce instances with
    data of type TokenSequence, and the final one should take instances of
    type TokenSequence and produce instances with data of type
    FeatureVectorSequence.

    defaultLabel is looked up in the new pipe's target alphabet before the
    pipe is returned (presumably registering it if it is not there yet --
    confirm against Alphabet.lookupIndex).
    """
    p = SerialPipes(jarray.array(pipeSequence, Pipe))
    p.getTargetAlphabet().lookupIndex(defaultLabel)
    return p


def LineGroupInstanceList(pipe, fileName, seperator="^\\s*$"):
    """Build an InstanceList from a file using pipe and a LineGroupIterator.

    The optional argument seperator is a regular expression specifying what
    separates instances from each other.  For example, when doing part of
    speech tagging an instance is a sentence: each word of the sentence is on
    a separate line, and a line matching seperator terminates the current
    sentence.

    Returns the newly created InstanceList.
    """
    data = InstanceList(pipe)
    # Loading is identical to appending to an (empty) list, so share the code.
    LineGroupInstanceAdd(data, fileName, seperator)
    return data


def LineGroupInstanceAdd(data, fileName, seperator="^\\s*$"):
    """Add the instances read from fileName to the existing InstanceList data.

    seperator has the same meaning as in LineGroupInstanceList: a regular
    expression matching the lines that separate one instance from the next.
    """
    # NOTE(review): the FileReader is never closed explicitly.  Closing it
    # here is only safe if data.add() drains the iterator before returning --
    # confirm against InstanceList.add before changing this.
    data.add(LineGroupIterator(FileReader(File(fileName)),
                               Pattern.compile(seperator),
                               1))


def initNewCRF(data, orderList, defaultLabel, gaussianPriorVariance,
               allowedPattern=".*", forbiddenPattern="\\s", connected=1):
    """Create and initialize a CRF with states read from data.

    The CRF is of the orders given by the sequence orderList (lower numbers
    are backoff levels), uses defaultLabel as the default label and the given
    gaussian prior variance.

    allowedPattern, forbiddenPattern and connected control what state
    transitions are permissible.  Allowed transitions (from LABEL1 to LABEL2)
    are ones where "LABEL1,LABEL2" matches allowedPattern but not
    forbiddenPattern.  For example:

        forbiddenPattern="O,I-.*"

    would disallow transitions from state "O" to state "I-NP", "I-PP" and so
    on, and

        allowedPattern="B-(.*),I-\\1|I-(.*),I-\\2|.*,B-.*|.*,O"

    would allow e.g. B-NP,I-NP but not B-NP,I-VP.

    Every state gets an infinite initial cost except the state named by the
    value returned from addOrderNStates, which gets an initial cost of 0.0.

    Returns the new CRF4 object.
    """
    forbidden = Pattern.compile(forbiddenPattern)
    allowed = Pattern.compile(allowedPattern)
    # Some default things that most users won't want to deal with.
    defaults = None
    orderArray = jarray.array(orderList, "i")

    crf = CRF4(data.getPipe(), None)
    startName = crf.addOrderNStates(data, orderArray, defaults,
                                    defaultLabel, forbidden, allowed,
                                    connected)
    crf.setGaussianPriorVariance(gaussianPriorVariance)

    # Forbid every state as an initial state, then re-enable the start state.
    for i in range(crf.numStates()):
        crf.getState(i).setInitialCost(Double.POSITIVE_INFINITY)
    crf.getState(startName).setInitialCost(0.0)
    return crf


def saveModel(crf, file):
    """Save a CRF model to the file named by file (Java serialization)."""
    s = ObjectOutputStream(FileOutputStream(file))
    try:
        s.writeObject(crf)
    finally:
        # Close the stream even if serialization fails.
        s.close()


def loadModel(file):
    """Read a CRF model from the file named by file and return it."""
    s = ObjectInputStream(FileInputStream(file))
    try:
        return s.readObject()
    finally:
        # Close the stream even if deserialization fails.
        s.close()


def printInstanceInfo(name, instanceList):
    """Print out helpful debugging information about a data set.

    name is a human-readable label for the data set; the number of instances
    in instanceList is printed next to it.
    """
    # A single parenthesized argument prints identically whether print is a
    # statement or a function.
    print("Number of %s instances = %d" % (name, instanceList.size()))


def printDataInfo(p):
    """Print the number of features and the labels found in pipe p.

    The output lets the user do sanity checking: first the size of the data
    alphabet, then every entry of the target alphabet on one "Labels:" line.
    """
    print("Number of features in data = %d" % p.getDataAlphabet().size())
    targets = p.getTargetAlphabet()
    labels = [targets.lookupObject(i).encode() for i in range(targets.size())]
    print(" ".join(["Labels:"] + labels))

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?