# Pyubcookie 0.1
# A transparently pubcookie-aware http library for Python/Twisted

import errno
import os.path
from urllib import urlencode

from twisted.internet import reactor
from twisted.web import client, error

from BeautifulSoup import BeautifulSoup

PUBCOOKIE_ROOT = 'https://weblogin.washington.edu/'
PUBCOOKIE_DOTFILE = os.path.expanduser('~/.pubcookie')

# UW's weblogin has a user-agent whitelist, so we're forced to lie here
DEFAULT_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.1) Gecko/20061010 Firefox/2.0'


def getPossibleHosts(host):
    """Yield every dot-separated suffix of `host`, shortest first.

    For 'a.b.c' this yields 'c', then 'b.c', then 'a.b.c'.
    """
    segments = host.split('.')
    for start in range(len(segments) - 1, -1, -1):
        yield '.'.join(segments[start:])


def _extractFormFields(form):
    """Return a {name: value} dict for the <input> elements of `form`.

    Inputs lacking a `name` or a `value` attribute are skipped instead of
    raising KeyError.  `has_key` is the attribute test of the BeautifulSoup
    tag object (not a dict), which is why it is still used here.
    """
    return dict(
        (str(el['name']), str(el['value']))
        for el in form.findAll('input')
        if el.has_key('name') and el.has_key('value')
    )


class HTTPClientFactory(client.HTTPClientFactory):
    """client.HTTPClientFactory that reports each received cookie to a callback.

    Accepts one extra keyword argument, `cookieCB`, called as
    cookieCB(key, value) for every cookie found in a response.  Redirects
    are never followed by the factory itself (`followRedirect` is forced
    to 0); HTTPSession.handleRedirect re-issues the request instead.
    """

    def __init__(self, *args, **kwargs):
        kwargs.setdefault('agent', DEFAULT_AGENT)
        # kludge ahoy! declaring cookieCB=None in the parameter list gives a
        # TypeError, so it is pulled out of kwargs by hand instead.
        self.cookieCB = kwargs.pop('cookieCB', None)
        kwargs['followRedirect'] = 0
        client.HTTPClientFactory.__init__(self, *args, **kwargs)

    def gotHeaders(self, headers):
        """Record the response headers and harvest any cookies they set."""
        self.response_headers = headers
        if 'set-cookie' not in headers:
            return
        for cookie in headers['set-cookie']:
            # Only the leading "key=value" pair is used; attributes after
            # the first ';' are ignored.
            # XXX todo: cookie expiration
            pair = cookie.split(';')[0]
            key, sep, value = pair.partition('=')
            if not sep:
                # No '=' at all: nothing that can be stored as key/value.
                continue
            key = key.lstrip()
            value = value.lstrip()
            if self.cookieCB:
                self.cookieCB(key, value)
            self.cookies[key] = value


class HTTPSession(object):
    """A HTTP 'session' that keeps track of cookies set between calls to request()"""

    def __init__(self):
        # Maps host -> {cookie name: cookie value}.
        self.cookies = {}

    def storeCookie(self, host, key, val):
        """Remember cookie `key`=`val` under `host`."""
        self.cookies.setdefault(host, {})[key] = val

    def getCookies(self, host):
        """Return the merged cookies stored for `host` and its suffixes.

        Suffixes are visited shortest first, so a cookie stored under a
        longer (more specific) host name overrides one stored under a
        shorter suffix.
        """
        merged = {}
        for candidate in getPossibleHosts(host):
            if candidate in self.cookies:
                merged.update(self.cookies[candidate])
        return merged

    def handleRedirect(self, f, oldurl):
        """Errback: follow a PageRedirect raised while fetching `oldurl`.

        Any other failure is re-raised by `f.trap`.  Parts missing from the
        redirect location are filled in from `oldurl`.  The follow-up is
        made through `self.request` with the URL only, so the original
        postdata and headers are not re-sent.
        """
        f.trap(error.PageRedirect)
        newurl = f.value.location
        (oldscheme, oldhost, oldport, oldpath) = client._parse(oldurl)
        (newscheme, newhost, newport, newpath) = client._parse(newurl)

        scheme = newscheme or oldscheme
        host = newhost or oldhost
        port = newport or oldport

        if newhost:
            path = newpath
        elif newpath[0] == '/' or '/' not in oldpath:
            path = newpath
        else:
            # Relative location: resolve against the old path's directory.
            path = oldpath[:oldpath.rfind('/') + 1] + newpath

        finalurl = "%s://%s:%d%s" % (scheme, host, port, path)
        return self.request(finalurl)

    def request(self, url, postdata=None, headers=None):
        """Fetch `url` and return the factory's Deferred.

        `postdata` may be None (GET), or a dict or pre-encoded body (POST).
        A dict is form-encoded and sent with the matching Content-type.
        `headers` is an optional dict of extra request headers; it is
        copied, so neither the caller's dict nor a shared default is ever
        modified.
        """
        # A fresh dict per call: with a mutable default, the Content-type
        # set below would leak into every later call using the default.
        headers = dict(headers) if headers is not None else {}

        if isinstance(postdata, dict):
            postdata = urlencode(postdata)
            headers['Content-type'] = 'application/x-www-form-urlencoded'

        scheme, host, port, path = client._parse(url)
        method = 'POST' if postdata is not None else 'GET'
        cookies = self.getCookies(host)

        def cookieCB(*a):
            # Cookies seen on this response are filed under the request host.
            return self.storeCookie(host, *a)

        factory = HTTPClientFactory(url, method=method, postdata=postdata,
                                    headers=headers, cookies=cookies,
                                    cookieCB=cookieCB)

        if scheme == 'https':
            # Imported only when an https URL is actually requested.
            from twisted.internet import ssl
            contextFactory = ssl.ClientContextFactory()
            reactor.connectSSL(host, port, factory, contextFactory)
        else:
            reactor.connectTCP(host, port, factory)

        return factory.deferred.addErrback(self.handleRedirect, url)


class PubcookieSession(HTTPSession):
    """HTTPSession that answers pubcookie redirect and login pages itself."""

    def __init__(self, username=None, password=None, root=PUBCOOKIE_ROOT,
                 dotfile=PUBCOOKIE_DOTFILE, *args, **kwargs):
        """Set up credentials from the arguments or, failing that, `dotfile`.

        The dotfile is consulted unless both `username` and `password` are
        given.  Raises ValueError if no complete set of credentials is
        available afterwards.
        """
        self.root = root
        if username is not None and password is not None:
            self.username = username
            self.password = password
        elif not self.readDotfile(dotfile):
            raise ValueError('No pubcookie credentials given')
        elif not (hasattr(self, 'username') and hasattr(self, 'password')):
            # The dotfile exists but did not supply both credentials; fail
            # now rather than with an AttributeError at login time.
            raise ValueError('No pubcookie credentials given')
        super(PubcookieSession, self).__init__(*args, **kwargs)

    def readDotfile(self, filename):
        """Load `root`, `username` and `password` settings from `filename`.

        Each line is "<key> <value>"; keys are case-insensitive and any
        other key is ignored.  Lines without both a key and a value are
        skipped.  Returns True if the file was read and False if it does
        not exist; any other IOError propagates.
        """
        try:
            f = open(filename, 'r')
        except IOError as e:
            if e.errno == errno.ENOENT:  # file not found
                return False
            raise
        try:
            for line in f:
                fields = line.split(None, 1)
                if len(fields) != 2:
                    continue
                key = fields[0].lower()
                if key in ('root', 'username', 'password'):
                    setattr(self, key, fields[1].rstrip())
        finally:
            f.close()
        return True

    def trapPubcookieRedirect(self, page):
        """Callback: hand `page` to the redirect handler if it looks like one.

        A plain substring test is used first so that ordinary pages are
        returned without being parsed.
        """
        if ('action="%s"' % self.root) in page:
            return self.handlePubcookieRedirect(page)
        return page

    def handlePubcookieRedirect(self, page):
        """Re-post the form in `page` that targets the pubcookie root.

        Returns `page` unchanged when no such form is found.
        """
        soup = BeautifulSoup(page)
        form = soup.find('form', action=self.root)
        if not form:
            # false positive; this is not a pubcookie redirect
            return page
        url = str(form['action'])
        return self.request(url, _extractFormFields(form))

    def handlePubcookieLogin(self, page):
        """Submit the first form in `page`, adding credentials when needed.

        The username and password are only added when the form posts to
        the pubcookie root.  Raises ValueError if `page` has no form.
        """
        soup = BeautifulSoup(page)
        form = soup.find('form')
        if not form:
            raise ValueError("got pubcookie login page without a login form")
        url = str(form['action'])
        postargs = _extractFormFields(form)
        if url == self.root:
            postargs['user'] = self.username
            postargs['pass'] = self.password
        return self.request(url, postargs)

    def request(self, url, *args, **kwargs):
        """Fetch `url`, chaining the appropriate pubcookie handler.

        A response from the pubcookie root is treated as a login page;
        any other response is checked for a pubcookie redirect form.
        """
        req = super(PubcookieSession, self).request(url, *args, **kwargs)
        if url == self.root:
            return req.addCallback(self.handlePubcookieLogin)
        return req.addCallback(self.trapPubcookieRedirect)


if __name__ == '__main__':
    import sys
    from twisted.python import log

    log.startLogging(sys.stdout)
    sess = PubcookieSession()
    sess.request('https://myuw.washington.edu/servlet/user').addCallbacks(log.msg, log.msg)
    reactor.run()