#!/usr/bin/python
# inspired by
# get_recommendations.pl (http://hacks.oreilly.com/pub/h/981)
# but instead, gets the books I allegedly own, so that I can feed them
# to thokbook or Booxter.
# Note that the perl script above gets the handling of "checked" entirely wrong,
# and I'd be surprised if it works at all - fixing that was a key part of making
# this work.
import sys
import urllib
import re
import urlextra
# I'd been using re.findall()[0], but that's kind of ugly and "IndexError" isn't
# what I wanted for the no-result case.
def re_findone(pat, input):
    """Return the first ``re.findall`` result for ``pat`` in ``input``.

    The result has whatever shape ``re.findall`` produces for the pattern:
    the whole match when there are no groups, the group's text when there is
    one group, or a tuple of group texts when there are several.

    Returns None (rather than raising IndexError) when nothing matches.
    """
    # Take the head of the findall() list, falling back to None when empty.
    return next(iter(re.findall(pat, input)), None)
# Given the content of a form, and data that we want to fill in, mash it
# all together to get the appropriate posting arguments.
#
# cheat a lot - since amazon (among others) uses wretched html, using a
# real xml or sgml parser would only get in our way.
def get_form_fields(loginform, form_arguments):
    """Build the urlencoded POST body for an HTML form.

    Scans ``loginform`` (raw HTML) for ``<input>`` tags and decides what to
    submit for each one:

    * ``hidden`` inputs are always copied through with their own value;
    * ``radio`` / ``checkbox`` inputs are copied through only when they are
      preselected (carry a ``checked`` attribute);
    * ``image`` inputs only decorate the action button and are ignored;
    * inputs without a ``name`` are ignored (there is nothing to submit);
    * everything else is a fill-in field whose value is looked up by name in
      ``form_arguments``; fields with no supplied value are reported on
      stderr and left out of the result.

    This deliberately cheats with regexes and whitespace splitting instead of
    a real parser, because the HTML being scraped is too malformed for one.
    The known limitation is unchanged: attribute values containing
    whitespace are not supported.

    Args:
        loginform: HTML text containing the form's input tags.
        form_arguments: mapping of field name -> value to fill in.

    Returns:
        The selected name/value pairs as a urlencoded string.
    """
    # urlencode moved between Python 2 and 3; import locally so this function
    # works on either without touching the file's import block.
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode

    thisdata = {}
    # Look for each <input ...> tag (all that matters, really).  The pattern
    # is matched case-insensitively because tag names are case-insensitive.
    for inputargs in re.findall(r"<input\b\s*([^>]*)>", loginform,
                                re.IGNORECASE):
        # Split the tag into attributes and build a name -> value dictionary.
        # split() with no argument also copes with tabs/newlines in the tag.
        thisinput = {}
        checked = False
        for attr in inputargs.split():
            if "=" in attr:
                # Split on the first "=" only: values may themselves hold "=".
                key, value = attr.split("=", 1)
                # Attributes might not be quoted; just normalize them.
                thisinput[key.lower()] = value.strip("\"'")
            elif attr.lower() == "checked":
                checked = True
            elif attr.lower() not in ("unchecked", "/"):
                # Some other valueless attribute we haven't run into yet.
                # ("/" is just the tail of a self-closing tag.)
                sys.stderr.write("EXTRA: %s\n" % attr)
        # Also accept the checked="checked" spelling.
        checked = checked or "checked" in thisinput

        # An input with no type attribute is a plain text field.
        ttype = thisinput.get("type", "text").lower()
        if ttype == "image":
            # Image items decorate the action button; they aren't real fields.
            continue
        tname = thisinput.get("name")
        if tname is None:
            # Nameless controls have no key to submit under.
            continue

        if ttype in ("hidden", "radio", "checkbox"):
            # The script never needs to select radio buttons or checkboxes,
            # it just copies through their defaults.  Hidden values always
            # copy; the others only copy if they're preselected.
            if ttype != "hidden" and not checked:
                continue
            # Browsers submit "on" for a value-less checkbox/radio and an
            # empty string for a value-less hidden field.
            default = "" if ttype == "hidden" else "on"
            thisdata[tname] = thisinput.get("value", default)
        elif tname in form_arguments:
            # A fill-in field we have a value for.
            thisdata[tname] = form_arguments[tname]
        else:
            # Nothing supplied: leave it out so the server uses its default.
            sys.stderr.write(
                "Couldn't figure out what to do with %s\n" % inputargs)
    return urlencode(thisdata)
if __name__ == "__main__":
    # Script body: sign in to Amazon with the supplied credentials, navigate
    # to the "items you own" listing, and print one "ASIN title" line per
    # book found on each page of that listing.
    #
    # NOTE(review): several pattern literals below look as if their "<...>"
    # portions were lost (apparently to an HTML-tag stripper). Each suspect
    # pattern is flagged where it occurs; the original text needs to be
    # restored from a clean copy of the script before this can run.

    # user gives their account info on the command line
    # (exactly two arguments; any other count makes this unpacking raise
    # ValueError)
    prog, email, password = sys.argv
    form_arguments = {"email": email, "password": password}
    # start with the top level page, no cookies
    # (urlextra is a project-local helper; from its use here it returns a
    # (page text, cookie) pair — presumably it fetches the URL, TODO confirm)
    doc, cookie = urlextra.get_page_and_cookie("http://www.amazon.com")
    # find the Your Account button
    # (re_findone returns None when the link is absent, in which case the
    # string concatenation raises TypeError)
    aalog = "http://www.amazon.com" + re_findone('href="(/exec/obidos/account-access-login/[^"]*)', doc)
    aadoc, cookie = urlextra.get_page_and_cookie(aalog,cookie)
    # find either "Improve your Recommendations" or "Edit your Recommendations"
    # NOTE(review): neither pattern has a capture group, so re_findone returns
    # the link text itself, which is then appended to the host name below.
    # Presumably these once captured the link's href — confirm.
    improvebase = re_findone('Edit your Recommendations',aadoc)
    if not improvebase:
        improvebase = re_findone('Improve your Recommendations',aadoc)
    improvelog = "http://www.amazon.com" + improvebase
    impdoc, cookie = urlextra.get_page_and_cookie(improvelog, cookie)
    # find flex-sign-in-done and the login form
    # NOTE(review): the pattern ends in a lazy "(.*?)" with nothing after it,
    # so the second group always matches the empty string and loginform is
    # "". Presumably a closing-tag terminator went missing — confirm.
    cc = re.compile("form action=([^ ]*?flex-sign-in-done/[^ ]*?) (.*?)", re.DOTALL)
    # two groups, so re_findone yields a (url, form body) tuple to unpack
    loginurl, loginform = re_findone(cc, impdoc)
    data = get_form_fields(loginform, form_arguments)
    edit_booklist_doc, cookie = urlextra.get_page_and_cookie(loginurl, cookie, data)
    # if we're in the Improve->Edit path, get the edit page again
    # NOTE(review): this pattern has a ")" with no matching "(", which the re
    # module rejects; the start of the pattern appears to be missing.
    eyc = re_findone("]*)>Edit Your Collection", edit_booklist_doc)
    if eyc:
        loginurl = eyc
        edit_booklist_doc, cookie = urlextra.get_page_and_cookie(loginurl, cookie)
    # note that on one of the paths, we still have to look for "all items you own"
    # instead of the default "unrated" set
    # NOTE(review): no capture group here either, so all_items would be the
    # literal text rather than a URL — presumably an href capture was lost.
    all_items = re_findone('All items you own', edit_booklist_doc)
    if all_items:
        loginurl = all_items
        edit_booklist_doc, cookie = urlextra.get_page_and_cookie(loginurl, cookie)
    # SHOULD insert a "select books only" button here, but it may be easier
    # to use the icons for filtering to make this a get-my-dvd's and -cds' tool
    # too.
    lastpageurl = loginurl
    # Walk the listing one page at a time until the end-of-list marker.
    while 1:
        # look for all hrefs after is-a-book icons
        # NOTE(review): unbalanced ")" in this pattern as well; the part
        # between the icon name and the link capture appears to be missing.
        collcc = re.compile('icons/icon-books.gif.*?\n.*?]*?)>([^<]*?)<')
        icons = re.findall(collcc, edit_booklist_doc)
        # actually do the work we came here for...
        for uri, title in icons:
            # the ASIN is the path component right after the literal "ASIN"
            asinval = uri.split("/")[uri.split("/").index("ASIN") + 1]
            print asinval, title
        # string.count is smoother to use than string.find
        if not edit_booklist_doc.count('value="Save & Continue"'):
            if edit_booklist_doc.count('value="Save item"'):
                print >> sys.stderr, "found end of list"
                break
            else:
                # oops? dump enough info to see how the interface changed
                # NOTE(review): nothing stops the loop on this path; control
                # falls through to the form lookup below.
                print "couldn't find *either* button!", edit_booklist_doc
        # now look at the page again to find the "save & continue"
        # which turns out to be a form that posts all of the recommendation settings
        # from the page. Filtering them out doesn't appear to break anything,
        # and one could use this to automate recommendations too...
        # NOTE(review): the pattern is empty, so re_findone returns "" and the
        # two-name unpacking below raises ValueError. The original pattern
        # (presumably capturing the form's action and body) must be restored.
        genformcc = re.compile("", re.DOTALL)
        nextpageurl, nextpageform = re_findone(genformcc, edit_booklist_doc)
        nextpagedata = get_form_fields(nextpageform, form_arguments)
        # the form's URL is joined onto urlextra.urlbase() of the previous
        # page (presumably its scheme+host prefix — TODO confirm)
        nextpageurl = urlextra.urlbase(lastpageurl) + nextpageurl
        edit_booklist_doc, cookie = urlextra.get_page_and_cookie(nextpageurl, cookie, nextpagedata)
        lastpageurl = nextpageurl