#!/usr/bin/python
import BaseHTTPServer
import SocketServer
import base64
import os
import time
class PageServerRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
#def __init__(self, request, client_address, server):
# BaseHTTPServer.BaseHTTPRequestHandler.__init__(request, client_address, server)
def log_event(self, kind, url):
print >> open(self.server.logfile,"a"), kind, url
def do_GET(self):
self.send_response(200, "OK")
self.end_headers()
if self.path == "/tags":
print >> self.wfile, "\n".join(self.server.clipstash.tags())
elif self.path == "/previous_urls":
print >> self.wfile, "\n".join(previous_urls())
def do_POST(self):
# print "PATH:", self.path
if self.path == "/url_seen":
return self.do_POST_url_seen()
if self.path == "/url_save_all":
return self.do_POST_url_save_all()
if self.path == "/clip_this":
return self.do_POST_clip_this()
self.send_response(404, "Not Found")
self.end_headers()
self.wfile.write("%s is bogus" % self.path)
def do_POST_clip_this(self):
info = {}
for k in ['ClipURL', 'ClipCategory', 'ClipSelection', 'ClipTitle', 'ClipReferrer']:
info[k] = self.headers.getheader(k)
length = int(self.headers.getheader("Content-Length"))
info["ClipHTML"] = base64.decodestring(self.rfile.read(length))
self.send_response(200, "OK")
self.end_headers()
self.wfile.write("")
print "GOT STUFF:"
for k in ['ClipURL', 'ClipCategory', 'ClipSelection', 'ClipTitle', 'ClipReferrer']:
print k, info[k]
### now just have to stash it somewhere...
self.server.clipstash.record(info)
### could use pyosd.display() but it can't do "middle" or "center"
os.system("echo '%s' | osd_cat -p middle -f '-*-new century schoolbook-bold-i-*-*-24-*-*-*-*-*-*-*' -A center -c orange -l 2 -s 1 &" % "Recorded")
def do_POST_url_seen(self):
# print "PATH:", self.path
url = self.headers.getheader("SaveURL")
# print "URL:", url
event = self.headers.getheader("EventType")
# print "EventType:", event
self.send_response(200, "OK")
self.end_headers()
self.wfile.write("")
self.log_event(event, url)
def do_POST_url_save_all(self):
# test with:
# echo http://www.thok.org/ | curl -d@- http://localhost:3382/url_save_all
length = int(self.headers.getheader("Content-Length"))
urls = self.rfile.read(length)
print >> open(urllog_file, "a"), "---", time.ctime()
print >> open(urllog_file, "a"), urls
self.send_response(200, "OK")
self.end_headers()
self.wfile.write("")
def log_request(self, code=None, size=None):
# this is just to make the default per-requst logs shut up
pass
urllog_file = os.path.expanduser("~/.urllog")
def previous_urls(logfile=None):
    """Return the URLs recorded since the most recent "--- <timestamp>"
    marker in the URL log (a cheap forward scan of the whole file).

    logfile defaults to the module-level urllog_file; passing a path makes
    the function reusable and testable.  A missing log file yields []
    rather than an IOError (nothing has been recorded yet).  Lines the
    browser logged with a spurious "undefined" prefix are repaired, and
    blank lines are skipped.
    """
    if logfile is None:
        logfile = urllog_file
    urls = []
    asof = ""
    try:
        f = open(logfile)
    except IOError:
        # first run: no log yet
        return []
    try:
        for line in f:
            if not line.strip():
                continue
            if line.startswith("undefined"):
                line = line.replace("undefined", "", 1)
            if line.startswith("--- "):
                # new batch marker: remember when, start the list over
                asof = line.strip().replace("--- ", "", 1)
                urls = []
                continue
            urls.append(line.strip())
    finally:
        f.close()
    sys.stdout.write("%d urls retrieved as of %s\n" % (len(urls), asof))
    return urls
class ForkingHTTPServer(SocketServer.ForkingMixIn, BaseHTTPServer.HTTPServer):
    """HTTPServer that forks a child per request, so one slow handler
    (e.g. the osd_cat popup after a clip) can't block later requests."""
    pass
import os
def deepmkdir(d):
    """Create directory d and any missing parents (recursive mkdir).

    Already-existing directories are a no-op.  Guards against the empty
    string: without it, a relative path whose top component doesn't exist
    recursed forever, because os.path.dirname("") == "" and
    os.path.isdir("") is False.
    """
    if not d or os.path.isdir(d):
        return
    deepmkdir(os.path.dirname(d))
    os.mkdir(d)
import time
import datetime
import urlparse
def crunch_url(url):
scheme, netloc, path, params, query, fragment = urlparse.urlparse(url.lower())
return (netloc.replace("www.","").replace(".com","")
+ "_" +
path.split("/")[-1].replace(".html","").replace(".htm",""))
import string
import urllib
squash_whitespace = string.maketrans(string.whitespace, " " * len(string.whitespace))
def line_safe_unquote(s):
# given encodeURIComponent, make something more
# greppable but still rfc822-safe
if not s: return ""
return urllib.unquote(s).translate(squash_whitespace)
import sets
badpunct = sets.Set()
dotpunct = sets.Set()
badpunct.update(string.punctuation)
dotpunct.update("+-=@_%")
squash_chars = "".join(badpunct - dotpunct) + string.whitespace
squash_punctuation = string.maketrans(squash_chars, "." * len(squash_chars))
def crunch_words(txt):
    """Reduce URL-encoded text to a short, filesystem-safe token (<=45 chars)."""
    # turn the two most common separator encodings into "_" before unquoting
    txt = txt.replace("%20", "_").replace("%0A", "_")
    txt = urllib.unquote(txt)
    # squash remaining punctuation and whitespace to dots
    txt = txt.translate(squash_punctuation)
    # collapse doubled separators -- two best-effort passes each,
    # deliberately not trying too hard
    for old, new in (("..", "."), ("..", "."),
                     ("__", "_"), ("__", "_"),
                     ("_._", ".")):
        txt = txt.replace(old, new)
    # consider a split_words -> SplitWords transform here
    return txt[:45]
class Stash:
    """Append-only filesystem store for clipped pages.

    Items land under <basedir>/<YYYY-MM>/<YYYY-MM-DD>/ with a timestamped,
    human-greppable filename, formatted as rfc822-ish "Key: value" headers,
    a blank line, then the raw HTML.
    """

    def __init__(self, basedir):
        self.base = basedir

    def tags(self):
        """Return known tags, one per line of the <base>/tags file."""
        f = open(os.path.join(self.base, "tags"))
        try:
            return [t.rstrip() for t in f.readlines()]
        finally:
            f.close()

    def record(self, info):
        """Write one clip (dict of Clip* headers plus ClipHTML) to disk.

        Filename parts use truthiness (info.get(k)), not key presence:
        the HTTP layer may store None or "" for absent headers, and
        joining None into the filename would crash ".".join below.
        """
        instant = time.time()
        when = datetime.date.fromtimestamp(instant)
        thisdir = os.path.join(self.base,
                               when.strftime("%Y-%m"),
                               when.strftime("%Y-%m-%d"))
        deepmkdir(thisdir)
        this_item_parts = [time.strftime("%Y%m%d-%H%M%S", time.localtime(instant))]
        if info.get("ClipCategory"):
            this_item_parts.append(info["ClipCategory"])
        if info.get("ClipURL"):
            this_item_parts.append(crunch_url(info["ClipURL"]))
        if info.get("ClipSelection"):
            this_item_parts.append(crunch_words(info["ClipSelection"]))
        elif info.get("ClipTitle"):
            this_item_parts.append(crunch_words(info["ClipTitle"]))
        this_item_parts.append("clip")
        this_item = ".".join(this_item_parts)
        this_item_path = os.path.join(thisdir, this_item)
        # rfc822-ish: headers, blank line, body
        item = open(this_item_path, "w")
        try:
            for k, v in info.items():
                if k == "ClipHTML":
                    continue
                if k == "ClipSelection":
                    # selections arrive encodeURIComponent-ed and may span
                    # lines; flatten so the header stays on one line
                    print >>item, "%s: %s" % (k, line_safe_unquote(v))
                else:
                    print >>item, "%s: %s" % (k, v)
            print >>item, ""
            print >>item, info.get("ClipHTML", "")
        finally:
            item.close()
import sys,os
if __name__ == "__main__":
    # Forking server on localhost only; the browser extension posts here.
    server = ForkingHTTPServer(("localhost", 3382), PageServerRequestHandler)
    # The request handlers read these two attributes off the server object:
    # per-event log next to this script, clips under ~/stufflog.
    server.logfile = os.path.join(os.path.dirname(sys.argv[0]), "pagelog")
    server.clipstash = Stash(os.path.expanduser("~/stufflog"))
    server.serve_forever()