#!/usr/bin/python
# Helpers for fetching pages with urllib while carrying cookies forward
# from one request to the next.  Written against the Python 2 urllib API.

import re
import sys
import urllib

# Alternative user agent, kept for reference:
#   "(compatible; MSIE 4.01; MSN 2.5; AOL 4.0; Windows 98)"
# The assignment is guarded so that the module can still be imported (and
# the pure helpers below used) where urllib does not provide URLopener.
if hasattr(urllib, "URLopener"):
    urllib.URLopener.version = "Lynx/2.8.4rel.1; libwww-FM/2.14; OpenSSL/0.9.7c"

# Matches an "expires=Day, date..." cookie attribute up to the next comma.
# The date itself contains ", ", which would otherwise be mistaken for the
# separator between cookies when the Set-Cookie value is split.
_EXPIRES_RE = re.compile(r"expires=\w+, [^,]+")


def cookiedict(cookies):
    """Return a dict mapping cookie names to cookie values.

    Each item of *cookies* is a "name=value" string.  Items are split on
    the first "=" only, so a value may itself contain "=".  An item with
    no "=" at all makes dict() raise ValueError.
    """
    return dict(c.split("=", 1) for c in cookies)


def misquote(s):
    """Return *s* unchanged.

    The quoting this function once performed is disabled; the old
    expression is kept here for reference:
      s.replace("/", "%2F").replace("=", "%3D").replace("%3D", "=", 1)
    """
    return s


def get_page_and_cookie(url, cookies=None, data=None):
    """Fetch *url* and return ``(document, cookies)``.

    url     -- address to open; it is also logged to stderr.
    cookies -- list of "name=value" strings sent in a Cookie header
               (None or empty sends no Cookie header).
    data    -- passed straight through to the opener's open(); per the
               urllib documentation a non-None value turns an HTTP
               request into a POST.

    The returned cookie list is the input cookies overlaid with any
    cookies set by the response (a response cookie replaces an input
    cookie of the same name), again as "name=value" strings.

    Requires urllib.FancyURLopener (Python 2 urllib).
    """
    if cookies is None:
        cookies = []

    sys.stderr.write("GETTING: %s\n" % (url,))

    opener = urllib.FancyURLopener()
    if cookies:
        opener.addheader("Cookie", "; ".join(cookies))

    response = opener.open(url, data)
    try:
        newcookies = []
        if "Set-Cookie" in response.headers:
            # Presumably several Set-Cookie headers arrive joined by ", "
            # in a single value -- TODO confirm against the HTTP library.
            # Drop the expires attributes first so that splitting on ", "
            # separates cookies rather than cutting a date in half.
            stripped = _EXPIRES_RE.sub("", response.headers["Set-Cookie"])
            # Keep only the leading "name=value" pair of every cookie and
            # discard attributes such as path or domain.
            newcookies = [part.split("; ")[0] for part in stripped.split(", ")]
        doc = response.read()
    finally:
        # Close the connection even when reading the body fails.
        response.close()

    # Copy old cookies forward, letting freshly set ones win.
    jar = cookiedict(cookies)
    jar.update(cookiedict(newcookies))
    return doc, ["%s=%s" % (name, value) for name, value in jar.items()]


def urlbase(s):
    """Assuming a canonical-form url, return the basename part.

    The basename is everything before the third "/", i.e. "scheme://host"
    (including a port, if present):
      urlbase("http://example.com/a/b.html") -> "http://example.com"
    """
    # Surprisingly, there isn't a splitter like this in urllib.
    return "/".join(s.split("/")[0:3])