⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 postingcrawler.py

📁 Harvestman-最新版本
💻 PY
字号:
#!/usr/bin/env python
"""postingcrawler.py - Demonstrating custom crawler writing by
subscribing to events.

This is a crawler which crawls BangPyper archives, finds job postings
and posts them to a specific blog.

Created by Anand B Pillai <abpillai at gmail dot com>

Copyright (C) 2008 Anand B Pillai
"""

import __init__
from harvestman.apps.spider import HarvestMan
import sys
import blogger
import getpass


def _between(text, start, end):
    """Return the text between the first `start` marker and the next `end`.

    Returns None when either marker is missing, so callers never slice
    with the -1 that `str.find` returns for "not found".
    """
    begin = text.find(start)
    if begin == -1:
        return None
    begin += len(start)
    finish = text.find(end, begin)
    if finish == -1:
        return None
    return text[begin:finish]


class JobPostingCrawler(HarvestMan):
    """ A job-posting crawler by integrating HarvestMan
    with the google blogger API.

    Pages whose title mentions 'job' are collected in `self.jobs` while
    crawling; when the crawl finishes the user is asked, posting by
    posting, whether each one should be sent to the blog.
    """

    def __init__(self):
        # Maps document title -> (full URL, raw document content).
        self.jobs = {}
        super(JobPostingCrawler, self).__init__()

    def after_parse_cb(self, event, *args, **kwargs):
        """Record the parsed document if it looks like an original job post.

        Bound to the 'afterparse' event. Reads `event.document` and
        `event.url`; stores matching documents in `self.jobs`.
        """
        document = event.document
        url = event.url
        if not document:
            return

        if 'job' not in document.title.lower():
            return

        data = document.content

        # If this is a reply to an original job post, ignore it: the text
        # of the 'Previous message:' link then ends with this page's title.
        idx = data.find('Previous message:')
        if idx != -1:
            idx2 = data.find('</A>', idx)
            # Only compare when the closing tag exists; slicing with -1
            # would silently drop the last character of the page instead.
            if idx2 != -1 and data[idx:idx2].endswith(document.title):
                return

        self.jobs[document.title] = (url.get_full_url(), data)

    def finish_event_cb(self, event, *args, **kwargs):
        """Interactively post the collected job postings to the blog.

        Bound to the 'beforefinish' event. Prompts on the console for
        confirmation and blogger credentials, then asks for each
        collected posting whether it should be posted.
        """
        if not self.jobs:
            print('No job postings found!')
            return

        print('Found %d job postings' % len(self.jobs))
        go_ahead = raw_input("Go ahead with posting [y/n] ? ")
        if go_ahead.lower().strip() != 'y':
            print('Found %d jobs, but did not post.' % len(self.jobs))
            return

        print('Logging to blogger...')
        username = raw_input("Enter username: ").strip()
        passwd = getpass.getpass("Enter password: ").strip()
        b = blogger.BlogJobPoster(username, passwd)
        print('Successfully logged into blogger.')

        count = 0
        for title, (url, data) in self.jobs.items():
            print(data)
            answer = raw_input("Post this JOB [y/n] ? ")
            if answer.lower().strip() != 'y':
                continue

            count += 1

            # Extract date from the text; skip the suffix if the page
            # has no <I>...</I> section rather than posting garbage.
            date = _between(data, '<I>', '</I>')
            if date is not None:
                title += '(Posted on: %s)' % date

            # The posting body is the <PRE> section; fall back to the
            # whole page when that section cannot be located.
            body = _between(data, '<PRE>', '</PRE>')
            if body is None:
                body = data

            content = body + '<br>\n'
            content += 'Referrering URL: <i><a href="%s">%s</a></i>\n' % (url, url)
            content = '<P>' + content + '</P>'

            print('Posting job ID %d for %s' % (count, url))
            b.post_job(title, content)


if __name__ == "__main__":
    spider = JobPostingCrawler()
    spider.initialize()

    config = spider.get_config()
    config.verbosity = 3
    config.robots = 0
    config.localise = 0

    # Hook the crawler's callbacks into the events they handle.
    spider.bind_event('afterparse', spider.after_parse_cb)
    spider.bind_event('beforefinish', spider.finish_event_cb)

    spider.main()

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -