#!/usr/bin/python

# inspired by
#    get_recommendations.pl (http://hacks.oreilly.com/pub/h/981)
# but instead, gets the books I allegedly own, so that I can feed them
# to thokbook or Booxter.

# Note that the perl script above gets the handling of "checked" entirely wrong,
# and I'd be surprised if it works at all - fixing that was a key part of making
# this work.

import sys
import urllib
import re

import urlextra

# I'd been using re.findall()[0], but that's kind of ugly and "IndexError" isn't
# what I wanted for the no-result case.
def re_findone(pat, input):
    """Return the first match of pat in input, or None if nothing matches.

    The result has the same shape as a single element of re.findall():
      * the whole matched text when pat has no groups,
      * the group's text when pat has exactly one group,
      * a tuple of group texts when pat has several groups.
    Groups that did not take part in the match are reported as "" (the
    way findall reports them), never as None.

    pat may be a pattern string or an already-compiled pattern.
    """
    # re.search stops at the first hit; findall would scan the whole page
    # and build a list of every match only for us to discard all but one.
    match = re.search(pat, input)
    if match is None:
        return None
    groups = match.groups("")
    if not groups:
        return match.group(0)
    if len(groups) == 1:
        return groups[0]
    return groups

# Given the content of a form, and data that we want to fill in, mash it
# all together to get the appropriate posting arguments.
#
# cheat a lot - since amazon (among others) uses wretched html, using a
#  real xml or sgml parser would only get in our way.

def get_form_fields(loginform, form_arguments):
    """Build the urlencoded POST body for a scraped HTML form.

    loginform      -- the HTML text of the form (anything containing the
                      form's <input ...> tags).
    form_arguments -- mapping of field name -> value for the fields the
                      caller wants to fill in (e.g. email, password).

    Returns the urlencoded string of name=value pairs:
      * hidden inputs are always copied through with their value,
      * radio and checkbox inputs are copied only when preselected
        ("checked"),
      * image inputs are ignored,
      * every other input is filled from form_arguments; if the caller
        supplied nothing for it, a note goes to stderr and the field is
        left out.
    Inputs without a name are skipped, since there is nothing to post
    them under.  A later input with the same name overwrites an earlier
    one.

    This is still deliberately a regex scrape rather than a real parser;
    it does not decode HTML entities in values.
    """
    # urlencode moved between Python 2 and 3; importing it here keeps this
    # function usable on either.
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode

    # One attribute: a name, optionally followed by =value, where value is
    # "double quoted", 'single quoted', or a bare (possibly empty) run of
    # non-whitespace.  Quoted values may contain spaces and "=".
    attr_re = re.compile(r"""([^\s=/]+)(?:=("[^"]*"|'[^']*'|\S*))?""")

    thisdata = {}
    # look for each <input> tag (all that matters, really.)
    for inputargs in re.findall(r"<input\s([^>]*)>", loginform, re.IGNORECASE):
        thisinput = {}
        checked = False
        for match in attr_re.finditer(inputargs):
            rawname, rawvalue = match.group(1), match.group(2)
            key = rawname.lower()
            if key == "checked":
                # covers both bare `checked` and XHTML `checked="checked"`,
                # wherever in the tag it appears
                checked = True
            if rawvalue is None:
                # valueless attribute; we only care about "checked"
                if key not in ("checked", "unchecked"):
                    sys.stderr.write("EXTRA: %s\n" % rawname)
                continue
            # attributes might not be quoted; just normalize them.
            if len(rawvalue) >= 2 and rawvalue[0] == rawvalue[-1] and rawvalue[0] in "\"'":
                rawvalue = rawvalue[1:-1]
            else:
                rawvalue = rawvalue.strip('"')
            thisinput[key] = rawvalue

        # an <input> with no type attribute is a text field
        ttype = thisinput.get("type", "text").lower()
        if ttype == "image":
            # image items are used to decorate the action button, aren't real
            continue
        tname = thisinput.get("name")
        if not tname:
            continue

        # The current script doesn't ever need to select radio or checkboxes,
        # it just copies through their defaults.  hidden values likewise,
        # since their whole point is to provide values to copy through.
        if ttype == "hidden":
            thisdata[tname] = thisinput.get("value", "")
        elif ttype in ("radio", "checkbox"):
            if checked:
                # "on" is the HTML default value for a checkbox/radio
                # that has no value attribute
                thisdata[tname] = thisinput.get("value", "on")
        elif tname in form_arguments:
            # everything else is a fill-in field
            thisdata[tname] = form_arguments[tname]
        else:
            # nothing supplied for this field: report it and leave it out
            sys.stderr.write("Couldn't figure out what to do with %s\n" % inputargs)
    return urlencode(thisdata)

def _require_match(pat, doc, what):
    """Return re_findone(pat, doc), or exit with a message naming `what`.

    Every scrape below depends on the page layout; when a pattern stops
    matching, a one-line explanation is more useful than a TypeError from
    trying to use None.
    """
    found = re_findone(pat, doc)
    if found is None:
        sys.exit("couldn't find %s - has the page layout changed?" % what)
    return found


if __name__ == "__main__":
    # user gives their account info on the command line
    # NOTE(review): a password in argv is visible to other local users in
    # the process list; left alone because it is this script's interface.
    if len(sys.argv) != 3:
        sys.stderr.write("usage: %s email password\n" % sys.argv[0])
        sys.exit(2)
    prog, email, password = sys.argv
    form_arguments = {"email": email, "password": password}

    # start with the top level page, no cookies
    doc, cookie = urlextra.get_page_and_cookie("http://www.amazon.com")
    # find the Your Account button
    aalog = "http://www.amazon.com" + _require_match(
        'href="(/exec/obidos/account-access-login/[^"]*)', doc,
        "the account-access-login link")
    aadoc, cookie = urlextra.get_page_and_cookie(aalog, cookie)

    # find either "Improve your Recommendations" or "Edit your Recommendations"
    improvebase = re_findone('<a href="([^"]*)">Edit your Recommendations', aadoc)
    if not improvebase:
        improvebase = _require_match(
            '<a href="([^"]*)">Improve your Recommendations', aadoc,
            'the "Edit/Improve your Recommendations" link')
    improvelog = "http://www.amazon.com" + improvebase
    impdoc, cookie = urlextra.get_page_and_cookie(improvelog, cookie)

    # find flex-sign-in-done and the login form; the pattern has two
    # groups, so the match is an (action url, form body) pair
    cc = re.compile("form action=([^ ]*?flex-sign-in-done/[^ ]*?) (.*?)</form>", re.DOTALL)
    loginurl, loginform = _require_match(cc, impdoc, "the sign-in form")
    data = get_form_fields(loginform, form_arguments)
    edit_booklist_doc, cookie = urlextra.get_page_and_cookie(loginurl, cookie, data)

    # if we're in the Improve->Edit path, get the edit page again
    eyc = re_findone("<a href=([^>]*)>Edit Your Collection", edit_booklist_doc)
    if eyc:
        loginurl = eyc
        edit_booklist_doc, cookie = urlextra.get_page_and_cookie(loginurl, cookie)

    # note that on one of the paths, we still have to look for "all items you own"
    # instead of the default "unrated" set
    all_items = re_findone('<a href="([^"]*)">All items you own</a>', edit_booklist_doc)
    if all_items:
        loginurl = all_items
        edit_booklist_doc, cookie = urlextra.get_page_and_cookie(loginurl, cookie)

    # SHOULD insert a "select books only" button here, but it may be easier
    # to use the icons for filtering to make this a get-my-dvds and -cds tool
    # too.

    # Both patterns are the same on every page, so compile them once.
    # collcc: the href and link text that follow an is-a-book icon.
    collcc = re.compile('icons/icon-books.gif.*?\n.*?<a href=([^>]*?)>([^<]*?)<')
    # genformcc: the first post form on the page - the "save & continue"
    # form, which posts all of the recommendation settings from the page.
    # Filtering them out doesn't appear to break anything, and one could
    # use this to automate recommendations too...
    genformcc = re.compile("<form method=post action=([^>]*?/[^>]*?)>(.*?)</form>", re.DOTALL)

    lastpageurl = loginurl
    while 1:
        # actually do the work we came here for...
        for uri, title in collcc.findall(edit_booklist_doc):
            parts = uri.split("/")
            # the ASIN is the path segment right after a literal "ASIN"
            if "ASIN" not in parts[:-1]:
                sys.stderr.write("no ASIN in link, skipping: %s\n" % uri)
                continue
            asinval = parts[parts.index("ASIN") + 1]
            sys.stdout.write("%s %s\n" % (asinval, title))

        # string.count is smoother to use than string.find
        if not edit_booklist_doc.count('value="Save & Continue"'):
            if edit_booklist_doc.count('value="Save item"'):
                sys.stderr.write("found end of list\n")
                break
            # oops? dump enough info to see how the interface changed, on
            # stderr so it doesn't pollute the ASIN list, and stop rather
            # than keep posting forms from a page we don't understand.
            sys.stderr.write("couldn't find *either* button! %s\n" % edit_booklist_doc)
            sys.exit(1)

        # post the "save & continue" form to get the next page
        nextpageurl, nextpageform = _require_match(
            genformcc, edit_booklist_doc, "the Save & Continue form")
        nextpagedata = get_form_fields(nextpageform, form_arguments)
        # presumably the form action is relative to the current page's base;
        # urlbase's exact behaviour lives in urlextra
        nextpageurl = urlextra.urlbase(lastpageurl) + nextpageurl
        edit_booklist_doc, cookie = urlextra.get_page_and_cookie(nextpageurl, cookie, nextpagedata)
        lastpageurl = nextpageurl

