⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 owtextfile.py

📁 orange源码 数据挖掘技术
💻 PY
字号:
"""
<name>Text File</name>
<description>Loads XML File</description>
<icon>icons/TextFile.png</icon>
<priority>3500</priority>
"""
from qt import *
from OWWidget import *
import OWGUI, OWToolbars, OWDlgs
from xml.sax import make_parser, handler
from orngTextCorpus import TextCorpusLoader, loadWordSet
import os
import modulTMT as lemmatizer
from OWTools import *
class XMLEcho(handler.ContentHandler):
    """SAX content handler that mirrors an XML document into a list view.

    Every start tag, end tag and non-blank run of character data becomes a
    QListViewItem.  Items created for start tags become the parent of
    everything up to the matching end tag.  Text items are labelled "TEXT"
    and carry the actual text in their ``myText`` attribute.

    Attributes:
        lv:    the node (list view or item) currently being filled.  Each
               node gets two ad-hoc attributes: ``parent`` (enclosing node)
               and ``lastAdded`` (its most recently appended child, or None).
        chars: character chunks received since the last flush.
        tags:  distinct element names in order of first appearance.
    """

    def __init__(self, lv):
        """Start echoing into *lv*, which acts as the root node."""
        handler.ContentHandler.__init__(self)
        self.lv = lv
        self.chars = []
        self.lv.lastAdded = None
        # The root is its own parent, so popping never leaves the tree.
        self.lv.parent = self.lv
        self.tags = []

    def _appendChild(self, parent):
        """Create a new item as the last child of *parent* and return it."""
        if parent.lastAdded is None:
            item = QListViewItem(parent)
        else:
            # Passing the previous sibling keeps items in document order.
            item = QListViewItem(parent, parent.lastAdded)
        parent.lastAdded = item
        return item

    def _flushText(self):
        """Turn buffered character data into a TEXT item of the current node.

        Whitespace-only data is discarded.  The buffer is always emptied.
        """
        text = "".join(self.chars).strip(" \n\t\r")
        self.chars = []
        if text:
            item = self._appendChild(self.lv)
            item.setText(0, "TEXT")
            item.myText = text

    def startElement(self, name, attrs):
        """Record the tag name and open a new node for the element."""
        if name not in self.tags:
            self.tags.append(name)
        # Text seen so far belongs to the enclosing element, not to the
        # child that is about to start.
        self._flushText()
        parent = self.lv
        item = self._appendChild(parent)
        item.parent = parent
        item.lastAdded = None
        item.setText(0, "<%s %s>" % (name, " ".join(["%s=\"%s\"" % (k, v) for k, v in attrs.items()])))
        self.lv = item

    def endElement(self, name):
        """Flush pending text, close the current node and add the end tag."""
        self._flushText()
        self.lv = self.lv.parent
        closing = self._appendChild(self.lv)
        closing.setText(0, "</%s>" % name)

    def characters(self, chrs):
        """Buffer a chunk of character data until the next tag boundary."""
        self.chars.append(chrs)
class OWTextFile(OWWidget):
    """Widget that loads an XML file and sends its contents as "Documents".

    The user picks an XML file, which is echoed into a tree view together
    with the list of distinct tag names found in it.  Tags can then be
    assigned the roles "content", "category" and "additional" before the
    file is handed to TextCorpusLoader by the Apply button.
    """
    settingsList = []
    contextHandlers = {}

    def __init__(self, parent=None, signalManager=None):
        """Build the controls; no file is loaded until the user browses."""
        OWWidget.__init__(self, parent, signalManager, 'Text File')

        self.inputs = []
        self.outputs = [("Documents", ExampleTable)]

        self.mainArea.setFixedWidth(0)
        ca = QFrame(self.controlArea)
        ca.adjustSize()
        gl = QGridLayout(ca, 5, 3, 5)

        col1 = QVBox(ca)

        # file browser
        box = QHGroupBox("Data File", col1)
        self.fileNameLabel = QLabel(box)
        self.fileNameLabel.setMinimumWidth(350)
        button = OWGUI.button(box, self, '...', callback = self.browseFile, disabled=0)
        button.setMaximumWidth(25)

        # XML tree
        QLabel(col1).setText("XML Document")
        self.listView = QListView(col1)
        self.listView.setAllColumnsShowFocus(1)
        self.listView.setRootIsDecorated(1)
        self.listView.addColumn("Document", 500)
        # Sorting is switched off so items stay in insertion order.
        self.listView.setSorting(-1)

        # text view -- displays the text node of the clicked XML item
        QLabel(col1).setText("Node text")
        self.textEdit = QTextView(col1)
        gl.addMultiCellWidget(col1, 0, 4, 0, 0)

        self.connect( self.listView, SIGNAL( 'clicked( QListViewItem* )' ),  self.fillText)

        # tags found in the document that have no role assigned yet
        self.listTags = []
        self.listTagsSelected = []
        col2 = QVGroupBox("Tags", ca)
        self.listBoxTags = OWGUI.listBox(col2, self, "listTagsSelected", "listTags")
        gl.addMultiCellWidget(col2, 0, 4, 1, 1)

        # preprocessing: lemmatizer (*fsa) and stop word (*txt) files are
        # offered from the TextData directory
        preproc = QVGroupBox("Preprocessing info", ca)
        hboxLem = QHBox(preproc)
        hboxStop = QHBox(preproc)

        dataFiles = os.listdir(self._textDataPath('.'))
        QLabel('Lemmatizer:', hboxLem)
        self.lemmatizer = '(none)'
        items = ['(none)']
        items.extend([a for a in dataFiles if a[-3:] == 'fsa'])
        OWGUI.comboBox(hboxLem, self, 'lemmatizer', items = items, sendSelectedValue = 1)

        QLabel('Stop words:', hboxStop)
        self.stopwords = '(none)'
        items = ['(none)']
        items.extend([a for a in dataFiles if a[-3:] == 'txt'])
        OWGUI.comboBox(hboxStop, self, 'stopwords', items = items, sendSelectedValue = 1)

        preproc.setFixedHeight(100)
        gl.addWidget(preproc, 0, 2)

        # role assignment
        col3 = QVGroupBox("Separation tags", ca)
        # No control is bound to these two; apply() falls back to the
        # default names "document" and "categories" while they are empty.
        self.documentTag = ""
        self.categoriesTag = ""

        hbox2 = QHGroupBox("Content tag", col3)
        vbox2 = QVBox(hbox2)
        OWGUI.button(vbox2, self, ">", self.onContentAdd)
        OWGUI.button(vbox2, self, "<", self.onContentRemove)
        self.contentTag = ""
        OWGUI.lineEdit(hbox2, self, "contentTag")

        hbox4 = QHGroupBox("Category tag", col3)
        vbox4 = QVBox(hbox4)
        OWGUI.button(vbox4, self, ">", self.onCategoryAdd)
        OWGUI.button(vbox4, self, "<", self.onCategoryRemove)
        self.categoryTag = ""
        OWGUI.lineEdit(hbox4, self, "categoryTag")

        hbox5 = QHGroupBox("Additional tags", col3)
        vbox5 = QVBox(hbox5)
        OWGUI.button(vbox5, self, ">", self.onInformativeAdd)
        OWGUI.button(vbox5, self, "<", self.onInformativeRemove)
        self.informativeTags = []
        self.informativeTagsSelected = []
        OWGUI.listBox(hbox5, self, "informativeTagsSelected", "informativeTags")
        app = OWGUI.button(ca, self, "Apply", self.apply)
        self.catDoc = False
        chBox  = OWGUI.checkBox(col3, self, 'catDoc', label = 'Output category-word', box = '')

        gl.addMultiCellWidget(col3, 1, 3,  2, 2)
        gl.addWidget(app, 4, 2)

        self.resize(1200, 700)

    def _textDataPath(self, name):
        """Return the path of *name* inside Orange's TextData directory."""
        return os.path.join(str(orangedir), 'OrangeWidgets', 'TextData', name)

    def openFile(self, fPath):
        """Parse the XML file *fPath* into the tree view and the tag list.

        Exceptions raised by open() or by the SAX parser propagate to the
        caller; the file is closed in either case.
        """
        self.listView.clear()
        self.textEdit.setText("")
        self.listBoxTags.clear()

        h = XMLEcho(self.listView)
        parser = make_parser()
        parser.reset()
        parser.setContentHandler(h)

        f = open(fPath, "r")
        try:
            parser.parse(f)
        finally:
            # Do not leak the handle when the document is malformed.
            f.close()

        self.listTags = h.tags[:]

    def browseFile(self, inDemos=0):
        """Ask the user for an XML file and load it.

        *inDemos* is accepted for interface compatibility and is not used.
        """
        startfile = "."
        filename = str(QFileDialog.getOpenFileName(startfile,
        'XML files (*.xml)\nAll files(*.*)',None,'Open Orange XML File'))

        # A cancelled dialog yields an empty name.  Leave the label alone
        # in that case: apply() reads the file name back from it.
        if filename == "": return
        self.fileNameLabel.setText(filename)
        self.openFile(filename)

    def fillText(self, lvi):
        """Show the text of the clicked item, or nothing if it has none."""
        if hasattr(lvi, "myText"):
            self.textEdit.setText(lvi.myText)
        else:
            self.textEdit.setText("")

    def _takeSelectedTag(self):
        """Remove the first selected entry of the "Tags" list and return it.

        Returns None when nothing is selected.
        """
        if not self.listTagsSelected:
            return None
        tag = self.listTags.pop(self.listTagsSelected[0])
        self.listTagsSelected = []
        # Rebind to a fresh list -- presumably what makes the bound list
        # box pick up the in-place change.
        self.listTags = self.listTags[:]
        return tag

    def _returnTag(self, tag):
        """Put *tag* back at the end of the "Tags" list."""
        self.listTags.append(tag)
        self.listTags = self.listTags[:]

    def onContentAdd(self):
        """Make the selected tag the content tag."""
        tag = self._takeSelectedTag()
        if tag is not None:
            self.contentTag = tag

    def onContentRemove(self):
        """Move the content tag, if any, back to the "Tags" list."""
        if self.contentTag:
            tag = self.contentTag
            self.contentTag = ""
            self._returnTag(tag)

    def onCategoryAdd(self):
        """Make the selected tag the category tag."""
        tag = self._takeSelectedTag()
        if tag is not None:
            self.categoryTag = tag

    def onCategoryRemove(self):
        """Move the category tag, if any, back to the "Tags" list."""
        # Test the category tag itself, not the content tag.
        if self.categoryTag:
            tag = self.categoryTag
            self.categoryTag = ""
            self._returnTag(tag)

    def onInformativeAdd(self):
        """Append the selected tag to the additional tags."""
        tag = self._takeSelectedTag()
        if tag is not None:
            self.informativeTags.append(tag)
            self.informativeTags = self.informativeTags[:]

    def onInformativeRemove(self):
        """Move the selected additional tag back to the "Tags" list."""
        if self.informativeTagsSelected:
            tag = self.informativeTags.pop(self.informativeTagsSelected[0])
            self.informativeTagsSelected = []
            self.informativeTags = self.informativeTags[:]
            self._returnTag(tag)

    def apply(self):
        """Load the chosen file with the current settings and send it out.

        Sends the loader's ``data`` on "Documents", or
        ``CategoryDocument(data).dataCD`` when "Output category-word" is
        checked.
        """
        # Empty roles fall back to the default tag names.
        tags = {
            "document" : self.documentTag or "document",
            "content" : self.contentTag or "content",
            "categories" : self.categoriesTag or "categories",
            "category" : self.categoryTag or "category",
        }
        if self.lemmatizer == '(none)':
            lem = lemmatizer.NOPLemmatization()
        else:
            lem = lemmatizer.FSALemmatization(self._textDataPath(self.lemmatizer).encode('latin_1'))
        if not self.stopwords == '(none)':
            for word in loadWordSet(self._textDataPath(self.stopwords).encode('latin_1')):
                lem.stopwords.append(word)
        # Hand over the tag names.  informativeTagsSelected only holds the
        # list box selection as indices (see onInformativeRemove).
        # NOTE(review): assumes the loader expects tag names -- confirm
        # against TextCorpusLoader.
        a = TextCorpusLoader(str(self.fileNameLabel.text()), tags, list(self.informativeTags), lem)
        if self.catDoc:
            self.send("Documents", CategoryDocument(a.data).dataCD)
        else:
            self.send("Documents", a.data)
if __name__=="__main__":
    # Stand-alone test run: show the widget in its own application.
    import os
    import sys
    # The original author's checkout; only switch to it where it exists so
    # the script also starts on other machines.
    devDir = '/home/mkolar/Docs/Diplomski/repository/orange/'
    if os.path.isdir(devDir):
        os.chdir(devDir)
    appl = QApplication(sys.argv)
    ow = OWTextFile()
    appl.setMainWidget(ow)
    ow.show()
    appl.exec_loop()

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -