#!/usr/bin/python
"""Print the ASIN and title of every book Amazon believes you own.

Inspired by get_recommendations.pl (http://hacks.oreilly.com/pub/h/981),
but instead of recommendations this gets the books I allegedly own, so
that I can feed them to thokbook or Booxter.

Note that the perl script above gets the handling of "checked" entirely
wrong, and I'd be surprised if it works at all - fixing that was a key
part of making this work.

Usage: pass the account e-mail address and password as the two command
line arguments.  "ASIN title" lines go to stdout, diagnostics to stderr.
"""

import re
import sys
import urllib

try:
    _urlencode = urllib.urlencode  # Python 2
except AttributeError:
    # Python 3 moved urlencode into urllib.parse.
    from urllib.parse import urlencode as _urlencode

AMAZON_ROOT = "http://www.amazon.com"

# NOTE(review): in the copy of this file that was reviewed, every regex
# fragment that looked like an HTML tag ("<input ...", "<a href=...",
# "<form ...", "</form>") had been stripped out, which left unbalanced or
# group-less patterns.  The tag parts below are reconstructed from what
# survived and from how the captures are used; _LOGIN_FORM_RE,
# _NEXT_PAGE_FORM_RE and _find_link() in particular are best guesses.
# Compare them against a pristine copy of the script before relying on them.

# The inside of each <input ...> tag.
_INPUT_TAG_RE = re.compile(r"<input\s+([^>]*?)>", re.IGNORECASE)

# The "Your Account" link on the front page (this pattern survived intact).
_ACCOUNT_LINK_RE = re.compile(
    r'href="(/exec/obidos/account-access-login/[^"]*)')

# (action url, form body) of the flex-sign-in-done login form.
_LOGIN_FORM_RE = re.compile(
    r"form action=([^ ]*?flex-sign-in-done/[^ ]*?) [^>]*>(.*?)</form>",
    re.DOTALL)

# (href, title) of the link that follows each is-a-book icon.
_BOOK_ROW_RE = re.compile(
    r"icons/icon-books.gif.*?\n.*?<a href=([^>]*?)>([^<]*?)<")

# (action url, form body) of the "Save & Continue" form on a listing page.
_NEXT_PAGE_FORM_RE = re.compile(
    r"<form [^>]*?action=([^>]*?/[^>]*?)>(.*?)</form>", re.DOTALL)


def re_findone(pat, input):
    """Return the first ``re.findall`` result for *pat* in *input*, or None.

    I'd been using re.findall()[0], but that's kind of ugly and
    "IndexError" isn't what I wanted for the no-result case.

    As with ``re.findall``, the result is the whole match when *pat* has
    no groups, the group text when it has one group, and a tuple of group
    texts when it has several.  *pat* may be a string or a compiled
    pattern.
    """
    res = re.findall(pat, input)
    if res:
        return res[0]
    return None


def _parse_input_attributes(tag_body):
    """Split the inside of an <input> tag into ``(attributes, checked)``.

    *attributes* maps lower-cased attribute names to their values with any
    surrounding double quotes removed.  *checked* is True when the tag
    carries a ``checked`` attribute, with or without a value.

    This still cheats: tokens are split on whitespace, so a quoted value
    that contains a space is not handled.
    """
    attrs = {}
    checked = False
    for token in tag_body.split():
        # partition() splits on the first "=" only, so values that contain
        # "=" themselves (base64 tokens, query strings) survive intact.
        key, sep, value = token.partition("=")
        key = key.lower()
        if sep:
            # attributes might not be quoted; just normalize them.
            attrs[key] = value.strip('"')
        if key == "checked":
            checked = True
        elif not sep and key not in ("unchecked", "/"):
            # Some other valueless attribute we haven't run into yet.
            sys.stderr.write("EXTRA: %s\n" % token)
    return attrs, checked


def get_form_fields(loginform, form_arguments):
    """Build the url-encoded POST body for the form markup in *loginform*.

    Given the content of a form, and data that we want to fill in, mash it
    all together to get the appropriate posting arguments.  We cheat a
    lot: since amazon (among others) uses wretched html, a real xml or
    sgml parser would only get in our way, so only <input> tags are
    looked at.

    Args:
        loginform: the HTML found between the form's opening and closing
            tags.
        form_arguments: dict mapping input names to the values to submit
            for fill-in fields.

    Returns:
        The collected fields as a url-encoded string ("" when nothing
        qualifies).

    Per input type:
      * hidden: always copied through with its value.
      * radio / checkbox: copied through only when preselected
        ("checked"); this script never selects one itself.
      * image: ignored, it only decorates the action button.
      * anything else (including a missing type, which HTML treats as
        text): filled in from *form_arguments*; when no value was
        supplied a warning goes to stderr and the field is left out.

    Inputs without a name are skipped, since a browser never submits them.
    """
    fields = {}
    for tag_body in _INPUT_TAG_RE.findall(loginform):
        attrs, checked = _parse_input_attributes(tag_body)
        name = attrs.get("name")
        if name is None:
            continue
        input_type = attrs.get("type", "text").lower()
        if input_type == "image":
            continue
        if input_type == "hidden":
            fields[name] = attrs.get("value", "")
        elif input_type in ("radio", "checkbox"):
            if checked:
                # "on" is what HTML submits for a checked control that
                # declares no value.
                fields[name] = attrs.get("value", "on")
        elif name in form_arguments:
            fields[name] = form_arguments[name]
        else:
            # but if we didn't supply something, it can just be
            # default/blank
            sys.stderr.write(
                "Couldn't figure out what to do with %s\n" % tag_body)
    return _urlencode(fields)


def _find_link(label, doc):
    """Return the href of the first anchor whose text starts with *label*.

    Accepts quoted and unquoted href values.  Returns None when *doc* has
    no such link.
    """
    return re_findone(
        r'<a href="?([^" >]*)"?[^>]*>' + re.escape(label), doc)


def _require(found, what):
    """Return *found*, or exit with a readable message when it is empty."""
    if not found:
        sys.exit("couldn't find %s - has the page layout changed?" % what)
    return found


def _asin_from_uri(uri):
    """Return the path segment that follows "ASIN" in *uri*, or None."""
    parts = uri.split("/")
    try:
        return parts[parts.index("ASIN") + 1]
    except (ValueError, IndexError):
        return None


def main(argv=None):
    """Log in to Amazon and print "ASIN title" for every owned book.

    Args:
        argv: ``[prog, email, password]``; defaults to ``sys.argv``.

    Returns:
        0 once the end of the list is reached, 2 on a usage error.  Exits
        through ``sys.exit`` with a message when an expected link or form
        cannot be found.

    NOTE(review): the password is taken from the command line, which
    makes it visible in process listings.
    """
    # Project-local helper.  Imported here rather than at module level so
    # the parsing helpers above can be imported without it.  From its use
    # below, get_page_and_cookie(url[, cookie[, post_data]]) returns a
    # (document, cookie) pair and urlbase(url) a prefix for relative urls.
    import urlextra

    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        prog = argv[0] if argv else sys.argv[0]
        sys.stderr.write("usage: %s EMAIL PASSWORD\n" % prog)
        return 2
    # user gives their account info on the command line
    email, password = argv[1:]
    form_arguments = {"email": email, "password": password}

    # start with the top level page, no cookies
    doc, cookie = urlextra.get_page_and_cookie(AMAZON_ROOT)

    # find the Your Account button
    account_path = _require(
        re_findone(_ACCOUNT_LINK_RE, doc), "the Your Account link")
    aadoc, cookie = urlextra.get_page_and_cookie(
        AMAZON_ROOT + account_path, cookie)

    # find either "Improve your Recommendations" or "Edit your
    # Recommendations"
    improvebase = _require(
        _find_link("Edit your Recommendations", aadoc)
        or _find_link("Improve your Recommendations", aadoc),
        "the Edit/Improve your Recommendations link")
    impdoc, cookie = urlextra.get_page_and_cookie(
        AMAZON_ROOT + improvebase, cookie)

    # find flex-sign-in-done and the login form
    loginurl, loginform = _require(
        re_findone(_LOGIN_FORM_RE, impdoc), "the sign-in form")
    loginurl = loginurl.strip('"')
    data = get_form_fields(loginform, form_arguments)
    edit_booklist_doc, cookie = urlextra.get_page_and_cookie(
        loginurl, cookie, data)

    # If we're in the Improve->Edit path, get the edit page again.  Then,
    # on one of the paths, we still have to look for "all items you own"
    # instead of the default "unrated" set.  Each lookup runs against the
    # page fetched by the previous step.
    for label in ("Edit Your Collection", "All items you own"):
        link = _find_link(label, edit_booklist_doc)
        if link:
            loginurl = link
            edit_booklist_doc, cookie = urlextra.get_page_and_cookie(
                loginurl, cookie)

    # SHOULD insert a "select books only" button here, but it may be
    # easier to use the icons for filtering to make this a get-my-dvd's
    # and -cds' tool too.
    lastpageurl = loginurl
    while True:
        # actually do the work we came here for: every href that follows
        # an is-a-book icon
        for uri, title in _BOOK_ROW_RE.findall(edit_booklist_doc):
            asinval = _asin_from_uri(uri)
            if asinval is None:
                sys.stderr.write("no ASIN in %s (%s)\n" % (uri, title))
                continue
            print("%s %s" % (asinval, title))

        if 'value="Save & Continue"' not in edit_booklist_doc:
            if 'value="Save item"' in edit_booklist_doc:
                sys.stderr.write("found end of list\n")
                break
            # oops? dump enough info to see how the interface changed.
            # This goes to stderr so it cannot end up in the ASIN list.
            sys.stderr.write(
                "couldn't find *either* button! %s\n" % edit_booklist_doc)

        # now look at the page again to find the "save & continue", which
        # turns out to be a form that posts all of the recommendation
        # settings from the page.  Filtering them out doesn't appear to
        # break anything, and one could use this to automate
        # recommendations too...
        nextpageurl, nextpageform = _require(
            re_findone(_NEXT_PAGE_FORM_RE, edit_booklist_doc),
            "the Save & Continue form")
        nextpagedata = get_form_fields(nextpageform, form_arguments)
        nextpageurl = urlextra.urlbase(lastpageurl) + nextpageurl.strip('"')
        edit_booklist_doc, cookie = urlextra.get_page_and_cookie(
            nextpageurl, cookie, nextpagedata)
        lastpageurl = nextpageurl
    return 0


if __name__ == "__main__":
    sys.exit(main())