⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cookiemgr.py

📁 网络蜘蛛
💻 PY
字号:
"""
HarvestManCookie.py - Module to implement a basic CookieManager for HarvestMan.
This file is part of the HarvestMan program.

Description
===========

Manages cookie persistence for a domain from the user agent's perspective.
Targets RFC 2109. (Very early stages of development)


Author : Nirmal C <nkchidambaram at yahoo dot com>

License
=======
Licensed under Open Software License. Refer LICENSE.txt file for details.

Copyright
=========

Copyright (C) 2003-2004  Nirmal K Chidambaram & Anand Pillai 

History
======

Anand Sep 02 2003 Mod       Added error checks for anydbm file opening.
                            Added error checks for dbm variable.
                            Added some util functions. Added a test
                            using our connector.
Sep 04 2003        Anand    1.2alpha release.
Jan  2 2004        Anand    1.4 bug fix version development started.
Feb  11 2004       Anand    Fixed a bug with cookie manager (spelling
                            mistake in function name). There is no bug
                            id for this.

Jun 14 2004         Anand          1.3.9 release.                            
                            
TODO
====
We need to replace HTTPRedirectHandler of urllib2 with our own
and add it using OpenerDirector for full RFC 2109 compliance.

"""

from Cookie import SimpleCookie
from Cookie import CookieError
import Cookie
import shelve
import cPickle
import re

from threading import Lock, Condition
from urllib2 import Request

from common import *

class CookieManager:
    """Validate, persist and replay HTTP cookies on behalf of a user agent.

    Incoming ``Set-Cookie`` / ``Set-Cookie2`` headers are parsed into a
    ``SimpleCookie``, filtered by the matching validation routine and handed
    to a cookie store.  The store object must provide ``store(cookie,
    reqhost)``, ``retrieve(reqhost)`` and ``close()``.
    """

    def __init__(self, cookiestore=None):
        """Create a manager backed by *cookiestore*.

        When *cookiestore* is None a ``DBMCookieStore`` on the file
        'harvestman_cookies.dat' is created.
        """
        self.__internal_cookie = SimpleCookie()

        if cookiestore is None:
            self.__cookiestore = DBMCookieStore('harvestman_cookies.dat')
        else:
            self.__cookiestore = cookiestore

        # Updated by get_next_domain(): True when the domain it was last
        # given started with a dot.
        self._netscape_domain = False
        # Leading run of non-dot characters, i.e. the first host label.
        self.domain_re = re.compile(r"[^.]*")
        # A trailing ".<digits>" component.
        self.ipv4_re = re.compile(r"\.\d+$")

    @staticmethod
    def startswith(str, initial):
        """Return True when the string *str* begins with *initial*.

        This is a static method so that ``self.startswith(domain, ".")``
        works.  The parameter names are kept as they were for compatibility,
        even though ``str`` shadows the builtin inside this function.
        """
        # Slicing copes with *initial* being longer than *str*.
        return str[:len(initial)] == initial

    def __validate_rules_rfc2109(self, temp_cookie, reqhost):
        """Drop morsels whose Domain attribute violates RFC 2109.

        An explicit Domain must start with a dot and contain at least one
        embedded dot; morsels without a Domain attribute are kept.
        *reqhost* is accepted for symmetry with the other validator and is
        not consulted.

        Returns *temp_cookie* (filtered in place) when at least one morsel
        survives, otherwise None.
        """
        try:
            # Snapshot the keys: morsels are deleted while walking them.
            for key in list(temp_cookie.keys()):
                domain = temp_cookie[key]['domain']
                if domain == '':
                    continue

                # Rule 1: reject unless ".x.y" shaped (leading dot AND an
                # embedded dot between the first and last character).
                has_leading_dot = domain[0] == '.'
                has_embedded_dot = domain[1:-1].find('.') != -1
                if not (has_leading_dot and has_embedded_dot):
                    del temp_cookie[key]

            if len(temp_cookie):
                return temp_cookie
            return None

        except CookieError:
            print('Cookie Parsing Error')

    def __validate_rules_rfc2965(self, temp_cookie, reqhost):
        """Validate rules for a cookie as per RFC 2965.

        Not implemented yet: always returns None, so cookies that arrive
        via ``Set-Cookie2`` are currently never stored.
        """
        # TODO
        pass

    def close_session(self):
        """Close the underlying cookie store."""
        self.__cookiestore.close()

    def set_cookie(self, header, reqhost):
        """Validate the cookies found in *header* and push them to the store.

        *header* is a mapping of HTTP header names to values; the names are
        matched case-insensitively against 'set-cookie' and 'set-cookie2'.

        Returns whatever the store's ``store()`` returns on success, or 0
        when *header* carries no cookie header or no cookie passes
        validation.
        """
        temp_cookie = SimpleCookie()
        validate_rules = None

        # We need to take care of lower case strings too
        for k in header.keys():
            name = k.lower()
            if name == 'set-cookie':
                temp_cookie.load(header[k])
                validate_rules = self.__validate_rules_rfc2109
            elif name == 'set-cookie2':
                temp_cookie.load(header[k])
                validate_rules = self.__validate_rules_rfc2965

        # No cookie header at all: nothing to validate or store.
        if validate_rules is None:
            return 0

        # Validate once and reuse the result.
        validated = validate_rules(temp_cookie, reqhost)
        if validated:
            return self.__push_to_data_store(validated, reqhost)
        return 0

    def __push_to_data_store(self, cookie, reqhost):
        """Hand *cookie* to the store and return the store's result."""
        return self.__cookiestore.store(cookie, reqhost)

    def get_next_domain(self, domain):
        """Return *domain* with its leading dot or leading label removed.

        Egs: a.b.c.net => .b.c.net => b.c.net => .c.net

        Also records in ``self._netscape_domain`` whether *domain* started
        with a dot.
        """
        # Added anand 02 Sep 2003
        if self.startswith(domain, "."):
            domain = domain[1:]
            self._netscape_domain = True
        else:
            # Remove only the first label; the dot that followed it stays.
            domain = self.domain_re.sub("", domain, 1)
            self._netscape_domain = False

        return domain

    def add_cookie_header(self, request, domain):
        """Add a single ``Cookie`` header for *domain* to *request*.

        Every stored morsel is sent as ``NAME=VALUE`` and all pairs are
        joined with '; ' into one header value.  *request* only needs an
        ``add_header(name, value)`` method.  Nothing is added when the store
        returns None or holds no cookies for *domain*.

        Only the cookies the store returns for *domain* itself are sent;
        parent domains are not searched.
        """
        # Added Anand Sep 02 2003
        cookies = self.get_cookie(domain)
        if cookies is None:
            return

        pairs = []
        for c in cookies:
            # Sorted for a deterministic header regardless of dict order.
            for name in sorted(c.keys()):
                morsel = c[name]
                pairs.append("%s=%s" % (morsel.key, morsel.coded_value))

        # One combined header: calling add_header() once per cookie would
        # leave only the last one on a urllib2 Request.
        if pairs:
            request.add_header("Cookie", "; ".join(pairs))

    def get_cookie(self, reqhost):
        """Return the cookies stored for *reqhost*.

        The result is whatever the store's ``retrieve()`` returns, passed
        through the (currently no-op) retrieval rules.
        """
        lst = self.__cookiestore.retrieve(reqhost)
        return self.__apply_retr_rules(lst, reqhost)

    def __apply_retr_rules(self, lst, reqhost):
        """Apply all retrieval rules - TODO.  Currently returns *lst* as is."""
        return lst


class AbstractCookieStore:
    """Base class with the key-derivation logic shared by cookie stores.

    Cookies are filed under a string key of the form ``<domain><path>``,
    for example 'example.com/shop/'.  Concrete stores call
    ``get_hash_to_store()`` when saving and ``retrieve_domains()`` when
    looking cookies up.
    """

    def __init__(self):
        self.__dict__ = {}

    def __strip_url_prefix(self, url):
        """Return *url* without a leading http(s):// scheme and 'www.'."""
        for scheme in ('http://', 'https://'):
            if url.find(scheme) == 0:
                url = url[len(scheme):]
                break
        # Require the dot so that hosts such as 'wwwfoo.com' stay intact.
        if url.find('www.') == 0:
            url = url[4:]
        return url

    def __parse_domain_from_reqhost(self, reqhost):
        """Return the bare host part of *reqhost* (no scheme, www. or path)."""
        rest = self.__strip_url_prefix(reqhost)
        slash = rest.find('/')
        if slash != -1:
            rest = rest[:slash]
        return rest

    def __default_path_from_reqhost(self, reqhost):
        """Return the directory part of *reqhost*, ending in '/'.

        This is the request path up to and including its right-most '/',
        or '/' when *reqhost* has no path at all.
        """
        rest = self.__strip_url_prefix(reqhost)
        first = rest.find('/')
        if first == -1:
            return '/'
        return rest[first:rest.rfind('/') + 1]

    def __key_for_morsel(self, morsel, reqhost):
        """Return the ``<domain><path>`` storage key for one morsel.

        A Morsel always carries 'domain' and 'path' entries (empty strings
        when the attribute was not sent), so the values are tested for
        emptiness instead of testing for key presence.
        """
        domain = morsel['domain']
        if domain:
            # Keys are stored without the leading dot of a domain cookie.
            if domain[0] == '.':
                domain = domain[1:]
        else:
            domain = self.__parse_domain_from_reqhost(reqhost)

        path = morsel['path']
        if not path:
            path = self.__default_path_from_reqhost(reqhost)

        return domain + path

    def __get__domain_key(self, cookie, reqhost):
        """Gets domain, path from cookie , adds up and returns.

        Only the first morsel of *cookie* is inspected; an empty cookie
        falls back to the host and directory taken from *reqhost*.
        """
        for key in cookie.keys():
            return self.__key_for_morsel(cookie[key], reqhost)

        return (self.__parse_domain_from_reqhost(reqhost)
                + self.__default_path_from_reqhost(reqhost))

    def get_hash_to_store(self, cookie, reqhost):
        """Group the morsels of *cookie* by storage key and pickle each group.

        Every morsel is treated as a separate cookie: its key is derived
        from its own Domain/Path (falling back to *reqhost*), and morsels
        that share a key end up in the same ``SimpleCookie``.

        Returns a dict mapping ``<domain><path>`` keys to pickled
        ``SimpleCookie`` objects.
        """
        grouped = {}

        for ckey in cookie.keys():
            morsel = cookie[ckey]
            key = self.__key_for_morsel(morsel, reqhost)

            bucket = grouped.get(key)
            if bucket is None:
                bucket = grouped[key] = Cookie.SimpleCookie()

            # Load one morsel at a time.  Gluing several output strings
            # together with ',' lets the separator leak into the last
            # attribute of the preceding morsel.
            bucket.load(morsel.OutputString())

        hash_store = {}
        for key in grouped.keys():
            hash_store[key] = cPickle.dumps(grouped[key])

        return hash_store

    def retrieve_domains(self, reqpath):
        """Return every ``<domain><path>`` key that may apply to *reqpath*.

        The scheme and a leading 'www.' are removed, then one key is
        produced per '/' in the remainder, from the bare domain to the
        deepest directory, e.g. 'a.com/x/y.html' gives
        ['a.com/', 'a.com/x/'].
        """
        rest = self.__strip_url_prefix(reqpath)

        # if uri just points to a domain
        if rest.find('/') == -1:
            return [rest + '/']

        lst = []
        pos = rest.find('/')
        while pos != -1:
            lst.append(rest[:pos + 1])
            pos = rest.find('/', pos + 1)

        return lst


class DBMCookieStore(AbstractCookieStore):
    """Persistence for cookies using shelve.

    Every access to the shelf from ``store()``, ``retrieve()`` and
    ``close()`` is made while holding ``self.lck``.  When the shelf cannot
    be opened ``self.db`` stays None and those methods return early.
    """

    def __init__(self, filename):
        """Open (or create) the shelf stored at *filename*."""

        self.__filename = filename
        self.__open_data_store()
        self.lck = Condition(Lock())

    def __open_data_store(self):
        """Open the dbm for writing, creating it when it does not exist.

        Failures are reported through ``debug`` and leave ``self.db`` None.
        """

        # Added error checks - Anand
        import os

        self.db = None

        debug('DB filename => ', self.__filename)

        try:
            if os.path.exists(self.__filename):
                # if the filesize is zero, remove it
                # otherwise dbm cribs.
                st = os.stat(self.__filename)
                if st.st_size == 0:
                    try:
                        os.chmod(self.__filename, 0o777)
                        os.remove(self.__filename)
                    except OSError as e:
                        print(e)

            if not os.path.exists(self.__filename):
                # create if does not exist
                self.db = shelve.open(self.__filename, 'c')
            else:
                # otherwise open for read/write
                self.db = shelve.open(self.__filename, 'w')

        except Exception as e:
            debug('Error opening DBM =>', e)

    def store(self, cookie, reqhost):
        """Store a cookie in the database.

        Returns 1 on success, or -1 when the database is not open.
        """

        if self.db is None:
            return -1

        debug('Storing cookie ', cookie)
        # The key/pickle mapping comes from the base class implementation,
        # reached through normal inheritance.
        hash_store = self.get_hash_to_store(cookie, reqhost)

        # NOTE(review): an existing entry under the same key is replaced,
        # not merged, so cookies stored earlier for that domain/path are
        # dropped -- confirm whether that is intended.
        self.lck.acquire()
        try:
            # Store this hash_store to dbm
            for key in hash_store.keys():
                self.db[key] = hash_store[key]
        finally:
            self.lck.release()

        return 1

    def close(self):
        """Close the database.

        Returns -1 when the database is not open, otherwise None.  The lock
        is released even when closing the shelf fails.
        """

        if self.db is None:
            return -1

        self.lck.acquire()
        try:
            self.db.close()
            self.db = None
        except Exception as e:
            print('Error closing DBM =>  %s' % (e,))
        finally:
            self.lck.release()

    def retrieve(self, reqpath):
        """Retrieve cookies from the cookie store.

        Returns a list with one unpickled cookie object per stored key that
        matches *reqpath*, or None when the database is not open.
        """

        if self.db is None:
            return None

        domains = self.retrieve_domains(reqpath)
        cookies = []

        self.lck.acquire()
        try:
            for key in domains:
                try:
                    if key in self.db:
                        # NOTE: unpickling trusts the contents of the shelf
                        # file; it must not come from an untrusted source.
                        cookies.append(cPickle.loads(self.db[key]))
                except Exception as e:
                    print(e)
        finally:
            self.lck.release()

        return cookies

# Test cases
def test():
    """Manual smoke test for CookieManager backed by a DBMCookieStore.

    Creates the shelf file 'cookie_test1.dbm' in the current directory and
    relies on the project module HarvestManUrlConnector, so it cannot run
    without the rest of HarvestMan.
    NOTE(review): conn.urlopen() presumably performs a live HTTP request --
    confirm before running this unattended.
    """

    print 'Testing Cookie Store'

    ## Create a Cookie Store , This Cookie Store stores all cookies in Shelve
    store = DBMCookieStore('cookie_test1.dbm') 

    ## Initialize Cookie Manager with DBM Cookie Store
    ckm = CookieManager(store)

    ## SetCookie , First Param - Dict of HTTP headers, here we are just passing Set-Cookie
    ## Second param   Request Path

    # Get cookie from www.playboy.com
    import HarvestManUrlConnector

    conn = HarvestManUrlConnector.harvestManUrlConnector()
    conn.initialize()

    # Proxy for anand
    # conn.set_proxy('192.168.254.19',  8080, (), False)
    # conn.configure()

    conn.urlopen('http://www.playboy.com')
    cookies = conn.get_cookies()

    # Hand-written header dict: the Set-Cookie value is three adjacent raw
    # string literals that Python joins into a single string.
    ckm.set_cookie({'Set-Cookie': r' PREF=ID=7034efa97bf9ec91:LD=en:TM=1061212089:LM=1061212089:S=lgUuL_8uCsJuL025; expires=Sun, 17-Jan-2038 19:14:07 GMT; path=/; domain=.google.co.in , TRIM=hashashas; expires=Sun, 17-Jan-2038 19:14:07 GMT; path=/; domain=.google.co.in 'r' PREF=ID=7034efa97bf9ec91:LD=en:TM=1061212089:LM=1061212089:S=lgUuL_8uCsJuL025; expires=Sun, 17-Jan-2038 19:14:07 GMT; path=/; domain=.google.co.in , TRIM=hashashas; expires=Sun, 17-Jan-2038 19:14:07 GMT; path=/; domain=.google.co.in 'r' PREF=ID=7034efa97bf9ec91:LD=en:TM=1061212089:LM=1061212089:S=lgUuL_8uCsJuL025; expires=Sun, 17-Jan-2038 19:14:07 GMT; path=/; domain=.google.co.in , TRIM=hashashas; expires=Sun, 17-Jan-2038 19:14:07 GMT; comment = someco path=/foo/;  ','SomeHTTPheader': 'SomeValue'},'www.google.co.in')
    # Feed every item returned by the connector back into the manager.
    for cookie in cookies:
        print cookie
        ckm.set_cookie(cookie, 'www.playboy.com')

    print 'Cookies Stored'
    print 'Fetching Cookies - User Agent'

    # Get Cookies based on path Cookies are returned as Cookie.SimpleCookie objects in a list
    # ("cokies" is the original spelling of this local variable.)
    cokies = ckm.get_cookie('http://www.playboy.com/')

    for c in cokies:
        print c.output(header="Cookie:")

    cookies = ckm.get_cookie('http://www.google.co.in/foo/bar/index.html')
    for c in cookies:
        print c
        # print c.output(header="Cookie:")

    ## Close Session , this calls CookieStore close, DBMCookieStore use this opportunity to close DBM file so that
    ## everything is written to disk , Say if it is MySQLCookieStore , this method might close connection
    ckm.close_session()
    print ' Testing Cookie Store Done'  

# Run the smoke test only when this file is executed as a script.
if __name__ == "__main__":
    test()


⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -