#!/usr/bin/python
# supposedly this is enough for it to work in 2.1, but I don't test it there.
from __future__ import nested_scopes
import urllib
import re
import shelve
import sys
import os
import traceback
import time
import rfc822
# User-Agent string sent on every urllib fetch this script makes
urllib.URLopener.version = "thok.org-comick.py-low-bandwidth-change-monitor/0.9"
def get_content(url):
    """Fetch *url* unconditionally and return the response body."""
    conn = urllib.urlopen(url)
    body = conn.read()
    conn.close()
    return body
def page_feeder_notify(link):
try:
uo = urllib.URLopener()
uo.addheader('url', link)
u = uo.open("http://localhost:3383/push_url", data="")
u.read()
u.close()
except Exception, e:
print >> sys.stderr, "feeder whine:", e, "on", link
#
# Inspired by Jarno Virtanen's article on Python Owns Us, at
# http://www.hole.fi/jajvirta/weblog/20030928T2101.html
# This is structured a little differently, using urllib instead of urllib2;
# we need FancyURLopener to get redirects, but we don't want
# the http_error_default change because it loses the status, so
# we just roll that back...
#
class MyFancyURLopener(urllib.FancyURLopener):
    # FancyURLopener gives us redirect handling, but its http_error_default
    # swallows the HTTP status; restore the plain URLopener version so that
    # error statuses (like 304) surface as IOError for the caller to inspect.
    http_error_default = urllib.URLopener.http_error_default
def get_changed_content(url, etag=None, lastmod=None):
    """Conditionally fetch *url*.

    Sends If-None-Match / If-Modified-Since when etag / lastmod are given.
    Returns None when the server answers 304 (content unchanged), otherwise
    a tuple (body, etag, lastmod) with the validators refreshed from the
    response headers when present.
    """
    uo = MyFancyURLopener()
    if etag:
        uo.addheader("If-None-Match", etag)
    if lastmod:
        uo.addheader("If-Modified-Since", lastmod)
    try:
        u = uo.open(url)
    except IOError, e:
        # urllib raises IOError("http error", status, ...) for non-2xx
        # replies; 304 just means our cached validators are still current
        if e[0] == "http error" and e[1] == 304:
            return None
        raise
    if u.headers.has_key("ETag"):
        etag = u.headers["ETag"]
    if u.headers.has_key("Last-Modified"):
        lastmod = u.headers["Last-Modified"]
    s = u.read()
    u.close()
    return (s, etag, lastmod)
img_pat = re.compile("""<img[^>]*src=['"]([^"'>]*)['"][^>]*>""", re.IGNORECASE) # "
def get_img(s):
    """Return the src attribute of every <img> tag found in *s*."""
    return img_pat.findall(s)
def check_img_start(s, prefix):
    """Return the last <img> src in *s* that begins with *prefix*, or None."""
    hits = [src for src in get_img(s) if src.startswith(prefix)]
    if not hits:
        return None
    # a later image is likely a second comic
    return hits[-1]
def check_start(s, u, arg):
    """Site checker: *arg* is the image-path prefix to look for."""
    return check_img_start(s, arg)
# def check_somethingpositive(s, u):
# return check_img_start(s, "arch/")
# def check_sinfest(s, u):
# return check_img_start(s, "/comics/")
def urlbase(u):
    """Return *u* with its final path component removed,
    e.g. http://host/a/b/c.html -> http://host/a/b"""
    ht, path = urllib.splittype(u)
    host, path = urllib.splithost(path)
    slash = path.rfind("/")
    if slash > -1:
        path = path[0:slash]
    return "%s://%s%s" % (ht, host, path)
def urlroot(u):
    """Return just the scheme and host of *u*, e.g. http://host"""
    ht, path = urllib.splittype(u)
    host, path = urllib.splithost(path)
    return "%s://%s" % (ht, host)
def check_maybe(s, u, arg):
    """Generic prefix-guessing checker; keenspace hosts also get their
    comicgenesis alias tried first (the farm was renamed)."""
    if "keenspace.com" in u:
        alias = old_check_maybe(s, u.replace("keenspace", "comicgenesis"), arg)
        if alias:
            return alias
    return old_check_maybe(s, u, arg)
def old_check_maybe(s, u, arg):
r = check_img_start(s, "/comics/")
if r: return r
r = check_img_start(s, "./comics/")
if r: return r
r = check_img_start(s, "../comics/") # questionablecontent.net
if r: return r
r = check_img_start(s, "comics/")
if r: return r
r = check_img_start(s, "Comics/") # missmab
if r: return r
r = check_img_start(s, "/comix/")
if r: return r
r = check_img_start(s, "strips/")
if r: return r
r = check_img_start(s, "/strips/")
if r: return r
r = check_img_start(s, "/hstrips/") # new dieselsweeties
if r: return r
r = check_img_start(s, "arch/")
if r: return r
r = check_img_start(s, "/archive/")
if r: return r
r = check_img_start(s, "/arch/") # new somethingpositive
if r: return r
r = check_img_start(s, "archive/")
if r: return r
r = check_img_start(s, "%s/comics/" % urlbase(u))
if r: return r
r = check_img_start(s, "%s/arch/" % urlbase(u))
if r: return r
r = check_img_start(s, "%s/strips/" % urlbase(u))
if r: return r
r = check_img_start(s, "active/")
if r: return r
r = check_img_start(s, "/active/") # furrymilitia
if r: return r
r = check_img_start(s, "%s/active/" % urlbase(u)) # badlydrawnkitties
if r: return r
r = check_img_start(s, "%s/comics/" % u) # new badlydrawnkitties
if r: return r
r = check_img_start(s, "%s/images/comics/" % urlbase(u)) # new sluggy
if r: return r
r = check_img_start(s, "archives/strips/") # for putf
if r: return r
r = check_img_start(s, "/Cartoons/") # for daybyday
if r: return r
r = check_img_start(s, "/images/comics/") # hello-cthulhu
if r: return r
r = check_img_start(s, "images/comics/") # hello-cthulhu revised
if r: return r
r = check_img_start(s, "%s/images/strips/" % urlbase(u)) # realmofatland
if r: return r
r = check_img_start(s, "images/strips/") # newer realmofatland
if r: return r
r = check_img_start(s, "imgs/comics/") # jinwicked
if r: return r
r = check_img_start(s, "/Assets/Finished") # radioactivepanda: /Assets/Finished%20Comics/Strip-0083.jpg
if r: return r
r = check_img_start(s, "%s/storage" % urlroot(u)) # partiallyclips: http://www.partiallyclips.com/storage/20050714_Researchers_lg.png
if r: return r
r = check_img_start(s, "/guest/comics/") # crfh during guest weekends
if r: return r
r = check_img_start(s, "manga/") # Miracle of Science
if r: return r
print "nothing worked, evaluate", get_img(s)
return r
def check_userfriendly(s, u, arg):
    """Checker for userfriendly.org's absolute archive image paths."""
    # http://www.userfriendly.org/cartoons/archives/03sep/xuf005935.gif
    return check_img_start(s, "%s/cartoons/archives/" % urlroot(u))
# def check_drfun_week(s, u):
# return check_img_start(s, "Dr-Fun/inline/thumbs/")
# def check_faans(s, u):
# # http://faans.com/images/2003/995goodbye.jpg
# return check_img_start(s, "%s/images/2" % urlbase(u))
def check_start_url(s, u, arg):
    """Like check_start, but *arg* is a format with %s filled by the page's urlbase."""
    return check_img_start(s, arg % urlbase(u))
# def check_pennyarcade(s, u):
# # images/2003/20030915l.gif
# return check_img_start(s, "images/2")
bruno_pad = re.compile("\\d{4}(sketch)?pics/")
def check_bruno(s, u, arg):
    """Checker for brunostrip-style paths: NNNNpics/ or NNNNsketchpics/."""
    candidates = [img for img in get_img(s) if bruno_pad.match(img)]
    if not candidates:
        return None
    # a later image is likely a second comic
    return candidates[-1]
alt_pat = re.compile('<img[^>]*src="([^">]*)"[^>]*alt="([^">]*)"[^>]*>', re.IGNORECASE + re.MULTILINE)
def check_img_alt(s, a):
    """Return the src of the last <img> whose alt text starts with *a*, or None."""
    hits = [src for src, alt in alt_pat.findall(s) if alt.startswith(a)]
    if not hits:
        return None
    # a later image is likely a second comic
    return hits[-1]
def check_alt(s, u, arg):
    """Site checker: *arg* is the alt-text prefix to look for."""
    return check_img_alt(s, arg)
# ozyandmillie was just a check_alt again, but now there are spurious newlines
# in long elements... as in, alt=\n"Today..." and the browser loses them too.
# Since it's stuck for a week, we kludge it...
def check_ozyandmillie(s, u, arg):
    """check_alt, but with newlines stripped first: the site wraps long
    alt= attributes across lines, which defeats the single-line regex."""
    return check_alt(s.replace("\n",""), u, arg)
# def check_krakow(s, u, arg):
# return check_img_start(s, "%s/comicpro/strips/" % urlbase(u))
# def check_dieselsweeties(s, u):
# return check_img_start(s, "http://images.clango.org/strips/")
def check_re(pattern, s):
    """Search *s* for *pattern*; return its first capture group, or None."""
    found = re.search(pattern, s)
    if found:
        return found.group(1)
    return None
def check_regexp(s, u, arg):
    """Site checker: *arg* is a regex (string or compiled) with one group."""
    return check_re(arg, s)
# <B>LATEST COMIC</B>: <A HREF="/202.html">#202 - Too Much of a Good Thing IV</A>
# def check_sexylosers(s, u):
# return check_re('<B>LATEST COMIC</B>: <A HREF="([^"]*)">', s)
def check_sexylosers_fan(s,u):
    # apparently unused by the sites table at present (note the 2-arg signature)
    return check_re('<B>LATEST FAN ART</B>: <A HREF = "([^"]*)">', s)
def check_sexylosers_guest(s,u):
    # apparently unused by the sites table at present (note the 2-arg signature)
    return check_re('<B>LATEST GUEST COMIC</B>: <A HREF="([^"]*)">', s)
# document <title> contents; used via check_regexp as the change tag for
# sites whose markup is otherwise hard to parse
title_pat = re.compile("<title>([^<]*)</title>", re.IGNORECASE + re.MULTILINE)
# def check_title(s, u):
# return check_re(title_pat, s)
# def check_redmeat(s, u):
# return check_re('<a href="([^"]*)">previous</a>', s)
def check_helen(s, u, arg):
    """Follow the frameset's "main" frame, then look for the tmsfeatures
    comic image on that inner page (requires a second fetch)."""
    nextpage = check_re('<frame name="main" src="([^"]*)" ', s)
    if not nextpage:
        return None
    framepage = get_content("%s%s" % (urlroot(u), nextpage))
    return check_img_start(framepage, "http://www.tmsfeatures.com/")
# def check_marilith(s, u):
# return check_re("<img src='([^']*)'", s)
#
# def check_minimumsecurity(s, u):
# return check_re('<a href="([^"]*)">', s)
# def check_ponju(s, u):
# # 'piggyhunter030809.jpg'
# return check_img_start(s, "piggyhunter")
# def check_zark(s, u):
# return check_img_start(s, "../pages")
# def check_sluggy(s,u):
# # "maybe" *used* to work, but that was possibly while pics was down
# return check_img_start(s,"http://pics")
# def check_gadgeteer(s,u):
# return check_re(re.compile('past reviews.*?<a href="([^"]*)"', re.DOTALL), s)
# presumably grabs the href adjacent to the "newest..." image link -- verify
angryflower_pat = re.compile('<img src="newest[^/]*href="([^"]*)"', re.MULTILINE)
# The master polling table.  Each entry is [url, checker, arg]: the checker
# is called as checker(page_body, url, arg) and returns a "tag" string (an
# image path, a regex capture, a title); a changed tag means a new comic.
# Commented-out entries are dead/moved sites, kept for the history.
# new form: [url, fn, arg] where arg is usually an re
sites = [
    ["http://www.sinfest.net/", check_maybe, None],
    ["http://somethingpositive.net/index.html", check_maybe, None],
    ["http://www.somethingpositive.net/newgolddream/", check_maybe, None],
    ["http://www.sluggy.com/", check_maybe, None],
    # ["http://ram.purrsia.com/fwf/", check_maybe, None],
    ["http://www.jadephoenix.org/fwf/", check_maybe, None],
    # ["http://www.furwillfly.com/", check_maybe, None],
    ["http://freefall.purrsia.com/default.htm", check_start, "/ff"],
    # ["http://loserzcomic.keenspace.com/", check_maybe, None], # moved to:
    ["http://loserz.scribblekid.org/", check_maybe, None],
    # "all gone" as of [eichin:20041212T0340-05]
    # ["http://www.radcomics.com/", check_maybe, None],
    ["http://www.queenofwands.net/", check_maybe, None],
    ["http://commanderkitty.com/", check_maybe, None],
    ["http://www.userfriendly.org/static/", check_userfriendly, None],
    ["http://www.goats.com/", check_maybe, None],
    ["http://www.megagamerz.com/", check_maybe, None],
    ["http://www.brunostrip.com/bruno.html", check_bruno, None],
    ["http://www.ibiblio.org/Dave/this-week.html", check_start, "Dr-Fun/inline/thumbs/"],
    ["http://www.ozyandmillie.org/", check_ozyandmillie, "Today's cartoon"],
    ["http://www.clanofthecats.com/", check_maybe, None],
    ["http://www.wanderingones.com/", check_maybe, None],
    ["http://lcd.keenspace.com/", check_maybe, None],
    # someday figure out how to handle frames at the outer level...
    # ["http://www.comicspage.com/helen/index.html", check_helen, None],
    ["http://www.tmsfeatures.com/tmsfeatures/subcategory.jsp?custid=67&catid=1242", check_start, "http://www.tmsfeatures.com/"],
    ["http://www.soaprope.com/", check_maybe, None],
    ["http://www.ubersoft.net/", check_maybe, None],
    ["http://www.gpf-comics.com/", check_maybe, None],
    ["http://www.errantstory.com/", check_maybe, None],
    ["http://www.wigu.com/", check_maybe, None],
    # <div class="DateHeader">Friday, November 19, 2004</div>
    # so there's an xml-ish way to say that, hmmm
    ["http://www.wigu.com/overcompensating/", check_regexp, 'date-header">([^<]*)</div'],
    ["http://strangedaze.keenspace.com/", check_maybe, None],
    ["http://www.nukees.com/", check_maybe, None],
    ["http://jackiesfridge.keenspace.com/", check_maybe, None],
    ["http://www.schlockmercenary.com/", check_maybe, None],
    ["http://nsitmc.keenspace.com/latest.html", check_maybe, None],
    # faans is commercial now
    # ["http://faans.com/", check_start_url, "%s/images/2"],
    ["http://www.sheldoncomics.com/comics/sheldon/index.html", check_maybe, None],
    ["http://flem.keenspace.com/", check_maybe, None],
    # http://sexylosers.com/egg-redirect.html is supposed to work, try it?
    ["http://sexylosers.com/", check_regexp, '<B>LATEST COMIC</B>: <A HREF="([^"]*)">'],
    ["http://www.ghastlycomic.com/", check_maybe, None],
    # ["http://www.washingtonpost.com/wp-dyn/style/columns/missmanners/", check_maybe, None],
    #["http://www.dieselsweeties.com/", check_start, "http://images.clango.org/strips/"],
    ["http://www.dieselsweeties.com/", check_maybe, None],
    ["http://www.redmeat.com/redmeat/current/index.html", check_regexp, '<a href="([^"]*)">previous</a>'],
    ["http://www.joeaverage.org/", check_maybe, None],
    ["http://tonjasteele.keenspace.com/", check_maybe, None],
    ["http://www.choppingblock.org/", check_maybe, None],
    ["http://www.roadwaffles.com/", check_regexp, '<img src=(comics/rw.*\.gif)>'],
    # ["http://www.eightland.com/", check_maybe, None],
    ["http://www.minimumsecurity.net/toons/index.htm", check_regexp, '<a href="([^"]*)">'],
    # ["http://oddjobs.keenspace.com/", check_maybe, None],
    # ["http://www.krakowstudios.com/", check_start_url, "%s/comicpro/strips/"],
    ["http://www.krakowstudios.com/", check_maybe, None],
    # ["http://marilith.com/", check_regexp, "<img src='([^']*)'"],
    ["http://marilith.com/", check_maybe, None],
    ["http://www.ponju.com/PiggyHunter/comic.php", check_start, "piggyhunter"],
    # dragon-tails ended 2004-11-04.
    # ["http://www.dragon-tails.com/", check_maybe, None],
    ["http://umlauthouse.keenspace.com/", check_maybe, None],
    ["http://umlauthouse.comicgenesis.com/", check_maybe, None],
    ["http://www.nuklearpower.com/latest.php", check_maybe, None],
    # ["http://www.polymercitychronicles.com/", check_maybe, None],
    ["http://www.polymercitychronicles.com/", check_alt, "[current strip]"],
    ["http://www.scarygoround.com/", check_maybe, None],
    # ["http://www.writheandshine.com/index2.html", check_maybe, None],
    ["http://pillarsoffaith.keenspace.com/", check_regexp, title_pat],
    ["http://webmarines.keenspace.com/", check_maybe, None],
    ["http://www.mrbang.net/", check_maybe, None],
    # ["http://www.avalonhigh.com/", check_maybe, None], # ended
    ["http://www.zark.com/front/azpages.html", check_start, "../pages"],
    ["http://www.catandgirl.com/", check_maybe, None],
    # http://www.penny-arcade.com/images/2005/20051207h.jpg
    # ["http://www.penny-arcade.com/comic", check_start, "/images"], # move to check_maybe?
    ["http://www.penny-arcade.com/comic", check_regexp, title_pat],
    ["http://www.the-gadgeteer.com/", check_regexp, re.compile('past reviews.*?<a href="([^"]*)"', re.DOTALL)],
    ["http://wapsisquare.com/", check_maybe, None],
    # ["http://jack.keenspace.com/", check_maybe, None], # jack moved to pholph
    ["http://www.pholph.com/",check_maybe, None],
    ["http://www.pvponline.com/", check_maybe, None],
    # ["http://www.fusiond.com/", check_maybe, None], # points to a generic page?
    # ["http://fusiond.digitalcrap.net/", check_maybe, None], # also dead
    ["http://www.antiheroforhire.com/", check_maybe, None],
    # movie-comics ended, should drop from here
    ## ["http://www.movie-comics.com/comic.php", check_maybe, None],
    # ["http://www.gushi.org/~whitestorm/rdt/index.html", check_maybe, None],
    #["http://www.dangerousthings.net", check_regexp, re.compile('<a title="Current comic" href="([^"]*)">', re.DOTALL)],
    # ended, and the characters got distributed out to other comics :-)
    #["http://www.dangerousthings.net", check_regexp, "(Current\s*Comic\s*:\s*[^<]*)<"],
    ["http://www.partiallyclips.com/pages/current.php", check_maybe, None],
    ["http://www.missmab.com/", check_maybe, None],
    ["http://bhag.sackofjustice.com/", check_maybe, None],
    # ["http://conscrew.keenspace.com/", check_maybe, None],
    ["http://www.conscrew.com/index.php", check_maybe, None],
    ["http://wicket.conscrew.com/index.php", check_maybe, None],
    ["http://www.dominic-deegan.com/", check_maybe, None],
    ["http://www.coffeebrain.com/comic/", check_start, "images/pages"],
    ["http://cdc.keenspace.com/", check_maybe, None],
    # the graphicsmash version died but keenspace came back, thanks to something-positive for noticing
    # ["http://www.graphicsmash.com/series.php?name=lifeonforbez&view=current", check_regexp, "<img src=http://www.webcomicsnation.com/~graphicsmashers/ccuasay/([^ >]*)[^>]*>"],
    # ["http://www.furrymilitia.net/comicdefault.aspx", check_maybe, None],
    # ["http://www.furrymilitia.net/betterdays/", check_regexp, '<img src=([^>]*)>'], # bad html on page
    ["http://www.jaynaylor.com/betterdays/", check_regexp, '<img src=([^>]*)>'], # bad html on page
    ["http://www.badlydrawnkitties.com/", check_maybe, None],
    ["http://www.ok-cancel.com/", check_maybe, None],
    # ["http://twolumps.keenspace.com/", check_maybe, None],
    ["http://www.twolumps.net/", check_maybe, None],
    ["http://crfh.net/", check_maybe, None],
    ["http://www.itswalky.com/", check_maybe, None],
    # sadly, gone, some search-squatter has it
    # ["http://www.w00t-comic.net/", check_regexp, "<big>(.*)</big>"],
    # oh, still dead but archived:
    ["http://usrbinw00t.keenspace.com/", check_maybe, None],
    ["http://www.sorethumbsonline.com/", check_maybe, None],
    ["http://www.nasa.gov/multimedia/imagegallery/index.html", check_regexp, '<a href="/multimedia/imagegallery/image_feature_(.*).html">'],
    ["http://www.sdss.org/iotw/iotw.html", check_regexp, "<center>(.*)</center>"],
    ["http://ares.nrl.navy.mil/sungrazer/recent.html", check_regexp, "<b>([^<]+)</b></font></td>"],
    # ["http://antwrp.gsfc.nasa.gov/apod/", check_regexp, " Explanation: ([\0-\377]*) Tomorrow"], # or put an re.DOTALL in check_regexp
    # doesn't work, because we're not quoting the match...
    # ["http://antwrp.gsfc.nasa.gov/apod/", check_regexp, '<IMG SRC="([^"]*)"'],
    ["http://antwrp.gsfc.nasa.gov/apod/", check_start, "image/"],
    ["http://lorebrandcomics.com/", check_alt, "[Lore:"],
    # putf is now over, and hitting it gives a randomly chosen comic
    # ["http://www.accendi.net/putf/", check_maybe, None],
    # ["http://ohmygods.timerift.net/", check_maybe, None],
    ["http://ohmygods.timerift.net/", check_regexp, '#BeginEditable "day" -->([^<]*)<'],
    ["http://www.daybydaycartoon.com/Default.aspx", check_maybe, None],
    ["http://www.vanvonhunter.com/index.html", check_maybe, None],
    ["http://www.drunkduck.com/Elijah_and_Azuu/", check_regexp, title_pat],
    ["http://www.drunkduck.com/The_Whovian_Observer/", check_regexp, title_pat],
    ["http://www.asofterworld.com/", check_start, ""],
    ["http://underpower.non-essential.com/", check_maybe, None],
    ["http://www.littledee.net/", check_bruno, None],
    ["http://www.vigilanteho.com/", check_maybe, None],
    # ["http://www.pvcomics.com/atland/", check_start, "http://www.pvcomics.com/comics/atland/"],
    ["http://www.realmofatland.com/", check_maybe, None],
    ["http://www.questionablecontent.net/", check_maybe, None],
    ["http://www.hello-cthulhu.com/", check_maybe, None],
    # there's other stuff there, but Metroid is gone
    # ["http://www.bobandgeorge.com/Fancomics/Metroid/Metroid.html", check_start, "Metroid"],
    ["http://www.petprofessional.net/", check_maybe, None],
    ["http://www.library-of-rain.com/botmaker/index.php", check_alt, "Strip"],
    ["http://www.radioactivepanda.com/", check_maybe, None],
    ["http://crap.jinwicked.com/", check_maybe, None],
    ["http://www.unshelved.com/", check_maybe, None],
    ["http://devilspanties.keenspot.com/", check_maybe, None], # mostly for the boston jokes
    ["http://www.evil-comic.com/index.html", check_maybe, None], # via schlockmercenary
    ["http://angryflower.com/", check_regexp, angryflower_pat],
    ["http://www.bugbash.net/", check_maybe, None],
    ["http://crossroads.keenspace.com/", check_maybe, None],
    ["http://www.galactanet.com/comic/index.htm", check_regexp, '<img\s*src\s*=\s*"\s*(Strip.*\....)">'],
    ["http://www.project-apollo.net/mos/index.html", check_maybe, None],
    ["http://www.candicomics.com/", check_maybe, None],
    ["http://www.elisalandis.com/", check_maybe, None],
    ["http://www.ctrlaltdel-online.com/comic.php", check_maybe, None],
    ["http://www.starslipcrisis.com/", check_maybe, None],
    ]
# RSS 0.91 skeleton; the %s is the channel's lastBuildDate (rfc822 format)
rss_header = """<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="feed.xsl"?>
<rss version="0.91">
<channel>
<title>Comick Update</title>
<lastBuildDate>%s</lastBuildDate>
<description>webcomics that have changed</description>
<link>http://www.thok.org/intranet/python/comics/comick.py.html</link>
"""
# previous header revision (no stylesheet line), still recognized when
# re-reading an older feed file
rss_header_old = rss_header.replace('<?xml-stylesheet type="text/xsl" href="feed.xsl"?>\n',"")
# per-entry template: title, link, description, guid (in that order)
rss_item = """<item>
<title>%s</title>
<link>%s</link>
<description>%s</description>
<guid isPermaLink="false">%s</guid>
</item>
"""
rss_footer = """</channel>
</rss>
"""
# seconds in a day
time_day = 24*60*60
# after a detected change, leave the site alone this long (6 hours)
rest_time = time_day / 4
# minimum gap between conditional re-checks of an unchanged site (1 hour)
check_rest_time = 1*60*60
try:
    # hashlib superseded the deprecated md5 module in Python 2.5
    import hashlib
    def make_guid(s):
        """Return the md5 hex digest of *s*, used as a stable RSS <guid>."""
        return hashlib.md5(s).hexdigest()
except ImportError:
    # very old interpreters (the file aims for 2.1 compatibility)
    import md5
    def make_guid(s):
        """Return the md5 hex digest of *s*, used as a stable RSS <guid>."""
        return md5.new(s).hexdigest()
class rssfile:
    """Incrementally rewritten RSS feed.

    Items from the existing feed file are re-read at construction and
    re-emitted (after any new items) at close(), so the feed keeps a
    rolling memory of previously announced changes.  Output goes to a
    scratch file that close() renames over the real one.
    """
    def __init__(self, name):
        # olditems appears unused -- kept as-is; ref/dsc map carried-over
        # link -> title/description, order preserves their sequence
        self.olditems = ""
        self.ref = {}
        self.dsc = {}
        self.order = []
        if os.path.isfile(name):
            f = open(name, "r")
            s = f.read()
            f.close()
            # handle one-rev-back rss header, for add-stylesheet case
            if re.match(rss_header_old % "[^<]*?", s):
                s = re.sub(rss_header_old % "[^<]*?", "", s, 1)
            else:
                s = re.sub(rss_header % "[^<]*?", "", s, 1)
            s = s.replace(rss_footer, "", 1)
            self.itemsplit(s)
        self.tmpname = "%s~" % name
        self.realname = name
        self.rssfile = open(self.tmpname, "w")
        self.rssfile.write(rss_header % rfc822.formatdate())
    def write_item_raw(self, title, link, desc):
        # the guid is an md5 of the description, so a changed description
        # reads as a brand-new item to aggregators
        self.rssfile.write(rss_item % (entity_quote(title), entity_quote(link),
                                       entity_quote(desc), make_guid(desc)))
        self.rssfile.flush()
    def write_item(self, title, link, desc):
        """Write a new item; it supersedes any carried-over copy of the same link."""
        self.write_item_raw(title, link, desc)
        if link in self.ref.keys():
            del self.ref[link]
        page_feeder_notify(link)
    def close(self):
        """Emit the carried-over items, finish the xml, replace the old feed."""
        self.oldwrite()
        self.rssfile.write(rss_footer)
        os.rename(self.tmpname, self.realname)
    # matches the items this script itself writes (either guid style)
    itemre = re.compile("\n".join(["<item>",
                                   "<title>(?P<title>[^<]*)</title>",
                                   "<link>(?P<link>[^<]*)</link>",
                                   "<description>(?P<description>[^<]*)</description>",
                                   '<guid( isPermaLink="false")?>(?P<guid>[^<]*)</guid>',
                                   "</item>"]), re.MULTILINE|re.I)
    def itemsplit(self,s):
        """Load previously written items out of the stripped feed body."""
        # NOTE(review): ref/dsc are keyed by the raw (still-quoted) link while
        # order stores the unquoted form; these only agree while links contain
        # no escaped characters -- verify before changing entity handling.
        for title, link, desc, dummy, guid in re.findall(self.itemre, s):
            if link not in self.order:
                self.ref[link] = entity_unquote(title)
                self.dsc[link] = entity_unquote(desc)
                self.order.append(entity_unquote(link))
    def oldwrite(self):
        """Re-emit every carried-over item that was not rewritten this run."""
        for link in self.order:
            if link in self.ref.keys():
                self.write_item_raw(self.ref[link], link, self.dsc[link])
# set to a true value for chatty per-site progress output
verbose = None
def entity_quote(s):
    """Escape raw ampersands so *s* is safe inside XML text.

    As found, this replaced "&" with itself (a no-op) -- almost certainly
    an artifact of the source passing through an HTML unescaper; the
    companion comment and the XML templates show &amp; was intended.
    """
    return s.replace("&", "&amp;") # add lt/gt later
def entity_unquote(s):
    """Reverse of entity_quote: turn &amp; back into a raw ampersand.

    As found, this replaced "&" with itself (a no-op) -- almost certainly
    an HTML-unescaping artifact in the source; restored to the inverse of
    entity_quote.
    """
    return s.replace("&amp;", "&") # add lt/gt later
def fmt_time(t):
    """Render a duration *t* in seconds as [-][Dd+]HHhMMmSSs."""
    sign = ""
    if t < 0:
        sign = "-"
        t = -t
    # peel off seconds, minutes, hours; whatever is left is whole days
    t, s = divmod(t, 60)
    t, m = divmod(t, 60)
    d, h = divmod(t, 24)
    if d:
        return "%s%dd+%02dh%02dm%02ds" % (sign, d, h, m, s)
    return "%s%02dh%02dm%02ds" % (sign, h, m, s)
def process_db(dbname):
    """One polling pass over all sites.

    Per-site state (tag, timestamps, HTTP validators) lives in the shelve
    db *dbname*; detected changes are appended to <dbname>.rss.  Sites
    still "resting" after a recent change or a recent probe are skipped,
    and conditional GETs (etag/last-modified) avoid refetching unchanged
    pages.  Prints a summary including when a re-run will be useful.
    """
    sh = shelve.open(dbname)
    rf = rssfile("%s.rss" % dbname)
    total_comics = 0
    failed_comics = 0
    changed_comics = 0
    # earliest time anything will be worth checking again
    nextcheck = time.time() + rest_time + check_rest_time
    for u, checkproc, arg in sites:
        total_comics += 1
        lastgot = 0
        lastcheck = 0
        lastetag = None
        lastmodtime = None
        arghash={}
        if sh.has_key(u):
            arghash = sh[u]
            lastgot = arghash["last-changed"]
            tag = arghash["tag"]
            lastcheck = arghash["last-queried"]
            if arghash.has_key("etag"):
                lastetag = arghash["etag"]
            if arghash.has_key("last-modtime"):
                lastmodtime = arghash["last-modtime"]
        now = time.time()
        # changed recently: let it rest before even probing
        if lastgot + rest_time > now:
            if verbose: print u, "not stale yet"
            nextcheck = min(lastgot + rest_time, nextcheck)
            continue
        # probed recently without change: back off between probes
        if lastcheck + check_rest_time > now:
            if verbose: print u, "poked recently"
            nextcheck = min(lastcheck + check_rest_time, nextcheck)
            continue
        # assume any not-poked will trigger sooner than any poked
        # not true in several cases, though a second run will be right
        try:
            sch = get_changed_content(u, lastetag, lastmodtime)
        except IOError, e:
            print "fetch", u, "failed:", e.args
            failed_comics += 1
            continue
        except KeyboardInterrupt:
            # stop polling but fall through to save what we have so far
            print "Processing", u, "interrupted, saving current values"
            break
        except:
            print "fetch", u, "failed!", repr(traceback.format_tb(sys.exc_info()[2]))
            continue
        if not sch:
            # server said 304: record that we asked, nothing else changes
            if verbose: print u, "unfetched: etag or last-mod still current"
            arghash["last-queried"] = now
            nextcheck = min(now + check_rest_time, nextcheck)
            sh[u] = arghash
            continue
        s, lastetag, lastmodtime = sch
        newtag = checkproc(s, u, arg)
        if verbose: print u
        if not newtag:
            print u, "not handled"
            failed_comics += 1
            continue
        arghash["tag"] = newtag
        arghash["last-queried"] = now
        arghash["etag"] = lastetag
        arghash["last-modtime"] = lastmodtime
        if not sh.has_key(u):
            # first time we have ever seen this site
            arghash["last-changed"] = now
            nextcheck = min(now + rest_time, nextcheck)
            rf.write_item("first time: %s" % newtag, u, "%s: first time %s @ %s" % (u, entity_quote(newtag), now))
            changed_comics += 1
        elif tag != newtag:
            print "tag for", u, "changed from", tag, "to", newtag
            arghash["last-changed"] = now
            nextcheck = min(now + rest_time, nextcheck)
            rf.write_item(newtag, u, "%s: %s changed to %s @ %s" % (u, entity_quote(tag), entity_quote(newtag), now))
            changed_comics += 1
        # otherwise, last-changed stays
        sh[u] = arghash
    sh.close()
    rf.close()
    checkwait = nextcheck - time.time()
    print "%d changed, %d failed (out of %d total) [wait %s until %s]" % (changed_comics, failed_comics, total_comics, fmt_time(checkwait), time.ctime(nextcheck))
def scan_db(dbname):
sh = shelve.open(dbname)
for u in sh.keys():
print u, sh[u]
sh.close()
def summary_db(dbname):
sitekeys = [i[0] for i in sites]
sh = shelve.open(dbname)
etagscount = 0
lastmodcount = 0
keycount = 0
for u in sh.keys():
args = sh[u]
if args.has_key("etag") and args["etag"]:
etagscount += 1
if args.has_key("last-modtime") and args["last-modtime"]:
lastmodcount += 1
if u in sitekeys:
del sitekeys[sitekeys.index(u)]
else:
print u, "not in sites"
keycount += 1
sh.close()
for u in sitekeys:
print u, "in sites, not in db"
print "etags found:", etagscount
print "times found:", lastmodcount
print "total found:", keycount
def fix_db1(dbname):
    """One-shot migration: extend (when, tag) pairs with a query timestamp."""
    db = shelve.open(dbname)
    stamp = time.time()
    for key in db.keys():
        when, tag = db[key]
        db[key] = (when, tag, stamp)
    db.close()
def fix_db2(dbname):
    """One-shot migration: convert 3-tuples into keyed dicts."""
    db = shelve.open(dbname)
    stamp = time.time()  # computed but unused in the original too
    for key in db.keys():
        when, tag, queried = db[key]
        db[key] = {"when": when, "tag": tag, "now": queried}
    db.close()
def fix_db3(dbname):
    """One-shot migration: rename dict keys to their current names."""
    db = shelve.open(dbname)
    stamp = time.time()  # computed but unused in the original too
    for key in db.keys():
        old = db[key]
        db[key] = {"last-changed": old["when"], "tag": old["tag"], "last-queried": old["now"]}
    db.close()
import pprint
def diag_db(dbname):
    """Interactive diagnostic: read a url fragment from stdin, then re-run
    the checker (with an unconditional fetch) for every matching site,
    showing the stored state and the freshly computed tag."""
    sh = shelve.open(dbname)
    print "Url fragment:"
    ufrag = sys.stdin.readline()
    for u, checkproc, arg in sites:
        if u.lower().find(ufrag.rstrip()) > -1:
            print "Checking:", u, checkproc, arg
            if sh.has_key(u):
                arghash = sh[u]
                print "Old args:", pprint.pformat(arghash)
                print "age:", (time.time()-arghash["last-changed"])/(24*60*60),"days"
            # etag/lastmod of None forces a full fetch
            sch = get_changed_content(u, None, None)
            s, lastetag, lastmodtime = sch
            newtag = checkproc(s, u, arg)
            print "New tag:", newtag
    sh.close()
def show_cruft_db(dbname):
sh = shelve.open(dbname)
for u, checkproc, arg in sites:
if sh.has_key(u):
arghash = sh[u]
age = time.time()-arghash["last-changed"]
if age/(24*60*60) > 30:
print fmt_time(age), "\t", u
sh.close()
import socket
if __name__ == "__main__":
    # don't let one wedged webserver hang the whole polling run
    socket.setdefaulttimeout(15)
    try:
        progname, dbname, verb = sys.argv
    except ValueError:
        sys.exit(sys.argv[0] + " dbname {update|scan|fix|summary} - you probably want comdb update")
    # dispatch table: command verb -> handler taking the db name
    {"update":process_db,
     "scan":scan_db,
     "fix":fix_db3,
     "summary":summary_db,
     "diag":diag_db,
     "cruft":show_cruft_db,
     }[verb.lower()](dbname)