#!/usr/bin/python
import BaseHTTPServer
import SocketServer
import base64
import os
import time
class PageServerRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
#def __init__(self, request, client_address, server):
# BaseHTTPServer.BaseHTTPRequestHandler.__init__(request, client_address, server)
def log_event(self, kind, url):
print >> open(self.server.logfile,"a"), kind, url
def do_GET(self):
self.send_response(200, "OK")
self.end_headers()
if self.path == "/tags":
print >> self.wfile, "\n".join(self.server.clipstash.tags())
elif self.path == "/previous_urls":
print >> self.wfile, "\n".join(previous_urls())
def do_POST(self):
# print "PATH:", self.path
if self.path == "/url_seen":
return self.do_POST_url_seen()
if self.path == "/url_save_all":
return self.do_POST_url_save_all()
if self.path == "/clip_this":
return self.do_POST_clip_this()
self.send_response(404, "Not Found")
self.end_headers()
self.wfile.write("%s is bogus" % self.path)
def do_POST_clip_this(self):
info = {}
for k in ['ClipURL', 'ClipCategory', 'ClipSelection', 'ClipTitle', 'ClipReferrer']:
info[k] = self.headers.getheader(k)
length = int(self.headers.getheader("Content-Length"))
info["ClipHTML"] = base64.decodestring(self.rfile.read(length))
self.send_response(200, "OK")
self.end_headers()
self.wfile.write("")
print "GOT STUFF:"
for k in ['ClipURL', 'ClipCategory', 'ClipSelection', 'ClipTitle', 'ClipReferrer']:
print k, info[k]
### now just have to stash it somewhere...
self.server.clipstash.record(info)
### could use pyosd.display() but it can't do "middle" or "center"
os.system("echo '%s' | osd_cat -p middle -f '-*-new century schoolbook-bold-i-*-*-24-*-*-*-*-*-*-*' -A center -c orange -l 2 -s 1 &" % "Recorded")
def do_POST_url_seen(self):
# print "PATH:", self.path
url = self.headers.getheader("SaveURL")
# print "URL:", url
event = self.headers.getheader("EventType")
# print "EventType:", event
self.send_response(200, "OK")
self.end_headers()
self.wfile.write("")
self.log_event(event, url)
def do_POST_url_save_all(self):
# test with:
# echo http://www.thok.org/ | curl -d@- http://localhost:3382/url_save_all
length = int(self.headers.getheader("Content-Length"))
urls = self.rfile.read(length)
print >> open(urllog_file, "a"), "---", time.ctime()
print >> open(urllog_file, "a"), urls
self.send_response(200, "OK")
self.end_headers()
self.wfile.write("")
def log_request(self, code=None, size=None):
# this is just to make the default per-requst logs shut up
pass
urllog_file = os.path.expanduser("~/.urllog")
def previous_urls(logfile=None):
    """Return the URLs recorded since the most recent "--- <timestamp>"
    marker in the URL log (a cheap forward scan of the whole file).

    logfile defaults to the module-level urllog_file; passing a path makes
    the function reusable and testable.  A missing log file yields []
    rather than an IOError (nothing has been recorded yet).  Lines the
    browser logged with a spurious "undefined" prefix are repaired, and
    blank lines are skipped.
    """
    if logfile is None:
        logfile = urllog_file
    urls = []
    asof = ""
    try:
        f = open(logfile)
    except IOError:
        # first run: no log yet
        return []
    try:
        for line in f:
            if not line.strip():
                continue
            if line.startswith("undefined"):
                line = line.replace("undefined", "", 1)
            if line.startswith("--- "):
                # new batch marker: remember when, start the list over
                asof = line.strip().replace("--- ", "", 1)
                urls = []
                continue
            urls.append(line.strip())
    finally:
        f.close()
    sys.stdout.write("%d urls retrieved as of %s\n" % (len(urls), asof))
    return urls
class ForkingHTTPServer(SocketServer.ForkingMixIn, BaseHTTPServer.HTTPServer):
    """HTTPServer that forks a child per request, so one slow handler
    (e.g. the osd_cat popup after a clip) can't block later requests."""
    pass
import os
def deepmkdir(d):
    """Create directory d and any missing parents (recursive mkdir).

    Already-existing directories are a no-op.  Guards against the empty
    string: without it, a relative path whose top component doesn't exist
    recursed forever, because os.path.dirname("") == "" and
    os.path.isdir("") is False.
    """
    if not d or os.path.isdir(d):
        return
    deepmkdir(os.path.dirname(d))
    os.mkdir(d)
import time
import datetime
import urlparse
def crunch_url(url):
scheme, netloc, path, params, query, fragment = urlparse.urlparse(url.lower())
return (netloc.replace("www.","").replace(".com","")
+ "_" +
path.split("/")[-1].replace(".html","").replace(".htm",""))
import string
import urllib
squash_whitespace = string.maketrans(string.whitespace, " " * len(string.whitespace))
def line_safe_unquote(s):
# given encodeURIComponent, make something more
# greppable but still rfc822-safe
if not s: return ""
return urllib.unquote(s).translate(squash_whitespace)
import sets
badpunct = sets.Set()
dotpunct = sets.Set()
badpunct.update(string.punctuation)
dotpunct.update("+-=@_%")
squash_chars = "".join(badpunct - dotpunct) + string.whitespace
squash_punctuation = string.maketrans(squash_chars, "." * len(squash_chars))
def crunch_words(txt):
    """Reduce URL-encoded text to a short, filesystem-safe token (<=45 chars)."""
    # turn the two most common separator encodings into "_" before unquoting
    txt = txt.replace("%20", "_").replace("%0A", "_")
    txt = urllib.unquote(txt)
    # squash remaining punctuation and whitespace to dots
    txt = txt.translate(squash_punctuation)
    # collapse doubled separators -- two best-effort passes each,
    # deliberately not trying too hard
    for old, new in (("..", "."), ("..", "."),
                     ("__", "_"), ("__", "_"),
                     ("_._", ".")):
        txt = txt.replace(old, new)
    # consider a split_words -> SplitWords transform here
    return txt[:45]
class Stash:
    """Append-only filesystem store for clipped pages.

    Items land under <basedir>/<YYYY-MM>/<YYYY-MM-DD>/ with a timestamped,
    human-greppable filename, formatted as rfc822-ish "Key: value" headers,
    a blank line, then the raw HTML.
    """

    def __init__(self, basedir):
        self.base = basedir

    def tags(self):
        """Return known tags, one per line of the <base>/tags file."""
        f = open(os.path.join(self.base, "tags"))
        try:
            return [t.rstrip() for t in f.readlines()]
        finally:
            f.close()

    def record(self, info):
        """Write one clip (dict of Clip* headers plus ClipHTML) to disk.

        Filename parts use truthiness (info.get(k)), not key presence:
        the HTTP layer may store None or "" for absent headers, and
        joining None into the filename would crash ".".join below.
        """
        instant = time.time()
        when = datetime.date.fromtimestamp(instant)
        thisdir = os.path.join(self.base,
                               when.strftime("%Y-%m"),
                               when.strftime("%Y-%m-%d"))
        deepmkdir(thisdir)
        this_item_parts = [time.strftime("%Y%m%d-%H%M%S", time.localtime(instant))]
        if info.get("ClipCategory"):
            this_item_parts.append(info["ClipCategory"])
        if info.get("ClipURL"):
            this_item_parts.append(crunch_url(info["ClipURL"]))
        if info.get("ClipSelection"):
            this_item_parts.append(crunch_words(info["ClipSelection"]))
        elif info.get("ClipTitle"):
            this_item_parts.append(crunch_words(info["ClipTitle"]))
        this_item_parts.append("clip")
        this_item = ".".join(this_item_parts)
        this_item_path = os.path.join(thisdir, this_item)
        # rfc822-ish: headers, blank line, body
        item = open(this_item_path, "w")
        try:
            for k, v in info.items():
                if k == "ClipHTML":
                    continue
                if k == "ClipSelection":
                    # selections arrive encodeURIComponent-ed and may span
                    # lines; flatten so the header stays on one line
                    print >>item, "%s: %s" % (k, line_safe_unquote(v))
                else:
                    print >>item, "%s: %s" % (k, v)
            print >>item, ""
            print >>item, info.get("ClipHTML", "")
        finally:
            item.close()
import sys,os
if __name__ == "__main__":
    # Forking server on localhost only; the browser extension posts here.
    server = ForkingHTTPServer(("localhost", 3382), PageServerRequestHandler)
    # The request handlers read these two attributes off the server object:
    # per-event log next to this script, clips under ~/stufflog.
    server.logfile = os.path.join(os.path.dirname(sys.argv[0]), "pagelog")
    server.clipstash = Stash(os.path.expanduser("~/stufflog"))
    server.serve_forever()