cleanups on wayback functionality
@@ -13,7 +13,8 @@ import re
import asyncio import sqlite3 import json -#import base64 + +# import base64 from shutil import copy2 as cp from urllib.parse import urlparse from collections import namedtuple@@ -38,6 +39,7 @@ from pandoc import PandocMD2HTML, PandocMD2TXT, PandocHTML2TXT
from meta import Exif import settings import keys +import wayback logger = logging.getLogger("NASG")@@ -140,7 +142,7 @@
def extractdomain(url): url = urlparse(url) - return url.netloc + return url.hostname J2 = jinja2.Environment(@@ -387,6 +389,7 @@ "disambiguatingDescription": self.type,
} return r + class WebImage(object): def __init__(self, fpath, mdimg, parent): logger.debug("loading image: %s", fpath)@@ -603,9 +606,7 @@ mapping = {
"Model": ["Model"], "FNumber": ["FNumber", "Aperture"], "ExposureTime": ["ExposureTime"], - "FocalLength": [ - "FocalLength" - ], + "FocalLength": ["FocalLength"], "ISO": ["ISO"], "LensID": ["LensID", "LensSpec", "Lens"], "CreateDate": ["CreateDate", "DateTimeOriginal"],@@ -681,14 +682,14 @@ self.parent = parent
self.size = size self.crop = crop - #@property - #def data(self): - #with open(self.fpath, "rb") as f: - #encoded = base64.b64encode(f.read()) - #return "data:%s;base64,%s" % ( - #self.parent.mime_type, - #encoded.decode("utf-8"), - #) + # @property + # def data(self): + # with open(self.fpath, "rb") as f: + # encoded = base64.b64encode(f.read()) + # return "data:%s;base64,%s" % ( + # self.parent.mime_type, + # encoded.decode("utf-8"), + # ) @property def suffix(self):@@ -806,6 +807,7 @@ self.fpath = fpath
self.dirpath = os.path.dirname(fpath) self.name = os.path.basename(self.dirpath) self.category = os.path.basename(os.path.dirname(self.dirpath)) + self.pointers = [] @cached_property def files(self):@@ -848,7 +850,9 @@ - and have an actual image file at the same directory level as
the Singular object """ images = {} - for match, alt, fname, title, css in RE_MDIMG.findall(self.content): + for match, alt, fname, title, css in RE_MDIMG.findall( + self.content + ): mdimg = MarkdownImage(match, alt, fname, title, css) imgpath = os.path.join(self.dirpath, fname) if imgpath in self.files:@@ -1002,7 +1006,7 @@ w = Webmention(
self.url, url, os.path.dirname(self.fpath), - self.dt.timestamp + self.dt.timestamp, ) webmentions.append(w) if self.is_reply:@@ -1010,7 +1014,7 @@ w = Webmention(
self.url, self.is_reply, os.path.dirname(self.fpath), - self.dt.timestamp + self.dt.timestamp, ) webmentions.append(w) return webmentions@@ -1143,7 +1147,6 @@
if self.event: r.update({"subjectOf": self.event}) - for url in list(set(self.to_syndicate)): r["potentialAction"].append( {@@ -1199,7 +1202,9 @@ return True
@property def corpus(self): - return "\n".join([self.title, self.name, self.summary, self.content]) + return "\n".join( + [self.title, self.name, self.summary, self.content] + ) async def copy_files(self): exclude = [@@ -1231,12 +1236,35 @@ continue
logger.info("copying '%s' to '%s'", f, t) cp(f, t) - async def save_to_archiveorg(self): requests.get(f"http://web.archive.org/save/{self.url}") + async def get_from_archiveorg(self): + done = glob.glob( + os.path.join(self.dirpath, f"*archiveorg*.copy") + ) + if done: + logger.debug( + "archive.org .copy exists for %s at %s", + self.name, + done[0], + ) + return + logger.info("trying to get archive.org .copy for %s", self.name) + if len(self.category): + wb = wayback.FindWaybackURL( + self.name, self.category, self.pointers + ) + wb.run() + if len(wb.oldest): + archiveurl = url2slug(wb.oldest) + t = os.path.join(self.dirpath, f"{archiveurl}.copy") + writepath(t, wb.oldest) async def render(self): + if settings.args.get("memento"): + await self.get_from_archiveorg() + if self.exists: return True@@ -1247,7 +1275,7 @@ "post": self.jsonld,
"site": settings.site, "menu": settings.menu, "meta": settings.meta, - "fnames": settings.filenames + "fnames": settings.filenames, } writepath( self.renderfile, J2.get_template(self.template).render(v)@@ -1260,8 +1288,7 @@ "summary": self.txt_summary,
"content": self.txt_content, } writepath( - self.txtfile, - J2.get_template(self.txttemplate).render(g), + self.txtfile, J2.get_template(self.txttemplate).render(g) ) del g@@ -1300,12 +1327,7 @@ ts = max(ts, arrow.get(post["dateModified"]).timestamp)
return arrow.get(ts) async def render_gopher(self): - lines = [ - "%s's gopherhole" - % (settings.site.name), - "", - "", - ] + lines = ["%s's gopherhole" % (settings.site.name), "", ""] for category, post in self.posts: line = "1%s\t/%s/%s\t%s\t70" % (@@ -1585,7 +1607,7 @@ @property
def renderdir(self): b = settings.paths.build if len(self.name): - b = os.path.join(b,settings.paths.category, self.name) + b = os.path.join(b, settings.paths.category, self.name) return b @property@@ -1623,11 +1645,19 @@ self.parent = parent
@property def mtime(self): - return max(list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination]) + return max( + list(sorted(self.parent.keys(), reverse=True))[ + 0 : settings.pagination + ] + ) @property def renderfile(self): - return os.path.join(self.parent.renderdir, settings.paths.feed, settings.filenames.json) + return os.path.join( + self.parent.renderdir, + settings.paths.feed, + settings.filenames.json, + ) @property def exists(self):@@ -1641,10 +1671,14 @@ return False
async def render(self): if self.exists: - logger.debug("category %s is up to date", self.parent.name) + logger.debug( + "category %s is up to date", self.parent.name + ) return - logger.info("rendering JSON feed for category %s", self.parent.name) + logger.info( + "rendering JSON feed for category %s", self.parent.name + ) js = { "version": "https://jsonfeed.org/version/1",@@ -1659,7 +1693,9 @@ },
"items": [], } - for key in list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination]: + for key in list(sorted(self.parent.keys(), reverse=True))[ + 0 : settings.pagination + ]: post = self.parent[key] pjs = { "id": post.url,@@ -1676,12 +1712,15 @@ {
"attachment": { "url": post.photo.href, "mime_type": post.photo.mime_type, - "size_in_bytes": f"{post.photo.mime_size}" + "size_in_bytes": f"{post.photo.mime_size}", } } ) js["items"].append(pjs) - writepath(self.renderfile,json.dumps(js, indent=4, ensure_ascii=False)) + writepath( + self.renderfile, + json.dumps(js, indent=4, ensure_ascii=False), + ) class XMLFeed(object): def __init__(self, parent, feedformat="rss"):@@ -1690,7 +1729,11 @@ self.feedformat = feedformat
@property def mtime(self): - return max(list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination]) + return max( + list(sorted(self.parent.keys(), reverse=True))[ + 0 : settings.pagination + ] + ) @property def renderfile(self):@@ -1700,7 +1743,9 @@ elif "atom" == self.feedformat:
fname = settings.filenames.atom else: fname = "index.xml" - return os.path.join(self.parent.renderdir, settings.paths.feed, fname) + return os.path.join( + self.parent.renderdir, settings.paths.feed, fname + ) @property def exists(self):@@ -1714,10 +1759,16 @@ return False
async def render(self): if self.exists: - logger.debug("category %s is up to date", self.parent.name) + logger.debug( + "category %s is up to date", self.parent.name + ) return - logger.info("rendering %s feed for category %s", self.feedformat, self.parent.name) + logger.info( + "rendering %s feed for category %s", + self.feedformat, + self.parent.name, + ) fg = FeedGenerator() fg.id(self.parent.feedurl)@@ -1732,12 +1783,14 @@ "email": settings.author.email,
} ) if self.feedformat == "rss": - fg.link(href=self.feedurl) + fg.link(href=self.parent.feedurl) elif self.feedformat == "atom": - fg.link(href=self.feedurl, rel="self") + fg.link(href=self.parent.feedurl, rel="self") fg.link(href=settings.meta.get("hub"), rel="hub") - for key in list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination]: + for key in list(sorted(self.parent.keys(), reverse=True))[ + 0 : settings.pagination + ]: post = self.parent[key] fe = fg.add_entry()@@ -1787,7 +1840,6 @@ fe.summary(post.summary)
writepath(self.renderfile, fg.atom_str(pretty=True)) - class Year(object): def __init__(self, parent, year): self.parent = parent@@ -1814,9 +1866,15 @@
@property def renderfile(self): if self.year == self.parent.newest_year: - return os.path.join(self.parent.renderdir, settings.filenames.html) + return os.path.join( + self.parent.renderdir, settings.filenames.html + ) else: - return os.path.join(self.parent.renderdir, self.year, settings.filenames.html) + return os.path.join( + self.parent.renderdir, + self.year, + settings.filenames.html, + ) @property def baseurl(self):@@ -1854,19 +1912,25 @@ "feed": self.parent.feedurl,
"title": self.parent.title, "paginated": True, "years": self.parent.years, - "year": self.year + "year": self.year, }, - "posts": self.posttmplvars + "posts": self.posttmplvars, } async def render(self): if self.exists: - logger.debug("category %s is up to date", self.parent.name) + logger.debug( + "category %s is up to date", self.parent.name + ) return - logger.info("rendering year %s for category %s", self.year, self.parent.name) + logger.info( + "rendering year %s for category %s", + self.year, + self.parent.name, + ) r = J2.get_template(self.template).render(self.tmplvars) writepath(self.renderfile, r) - del(r) + del r class Flat(object): def __init__(self, parent):@@ -1876,7 +1940,9 @@ @property
def posttmplvars(self): return [ self.parent[key].jsonld - for key in list(sorted(self.parent.keys(), reverse=True)) + for key in list( + sorted(self.parent.keys(), reverse=True) + ) ] @property@@ -1885,7 +1951,9 @@ return max(self.parent.keys())
@property def renderfile(self): - return os.path.join(self.parent.renderdir, settings.filenames.html) + return os.path.join( + self.parent.renderdir, settings.filenames.html + ) @property def template(self):@@ -1915,17 +1983,19 @@ "url": self.parent.url,
"feed": self.parent.feedurl, "title": self.parent.title, }, - "posts": self.posttmplvars + "posts": self.posttmplvars, } async def render(self): if self.exists: - logger.debug("category %s is up to date", self.parent.name) + logger.debug( + "category %s is up to date", self.parent.name + ) return logger.info("rendering category %s", self.parent.name) r = J2.get_template(self.template).render(self.tmplvars) writepath(self.renderfile, r) - del(r) + del r class Gopher(object): def __init__(self, parent):@@ -1947,17 +2017,27 @@ return False
@property def renderfile(self): - return os.path.join(self.parent.renderdir, settings.filenames.gopher) + return os.path.join( + self.parent.renderdir, settings.filenames.gopher + ) async def render(self): if self.exists: - logger.debug("category %s is up to date", self.parent.name) + logger.debug( + "category %s is up to date", self.parent.name + ) return - lines = ["%s - %s" % (self.parent.name, settings.site.name), "", ""] + lines = [ + "%s - %s" % (self.parent.name, settings.site.name), + "", + "", + ] for post in [ self.parent[key] - for key in list(sorted(self.parent.keys(), reverse=True)) + for key in list( + sorted(self.parent.keys(), reverse=True) + ) ]: line = "0%s\t/%s/%s\t%s\t70" % ( post.title,@@ -1978,6 +2058,7 @@ )
lines.append(line) lines.append("") writepath(self.renderfile, "\r\n".join(lines)) + class Sitemap(dict): @property@@ -2058,7 +2139,6 @@ logger.error("sending failed: %s %s", r.status_code, r.text)
else: self.save(r.text) - def backfill_syndication(self): """ this is very specific to webmention.io and brid.gy publish """@@ -2105,9 +2185,7 @@ )
if "url" in maybe["http_body"]: data = json.loads(maybe["http_body"]) url = data["url"] - sp = os.path.join( - self.dpath, "%s.copy" % url2slug(url) - ) + sp = os.path.join(self.dpath, "%s.copy" % url2slug(url)) if os.path.exists(sp): return with open(sp, "wt") as f:@@ -2122,6 +2200,7 @@ self.dpath,
e, ) pass + class WebmentionIO(object): def __init__(self):@@ -2258,12 +2337,22 @@ categories = {}
frontposts = Category() home = Home(settings.paths.get("home")) + reverse_redirects = {} + for e in glob.glob(os.path.join(content, "*", "*.url")): + post = Redirect(e) + rules.add_redirect(post.source, post.target) + if post.target not in reverse_redirects: + reverse_redirects[post.target] = [] + reverse_redirects[post.target].append(post.source) + for e in sorted( glob.glob( os.path.join(content, "*", "*", settings.filenames.md) ) ): post = Singular(e) + if post.url in reverse_redirects: + post.pointers = reverse_redirects[post.target] # deal with images, if needed for i in post.images.values(): queue.put(i.downsize())@@ -2279,11 +2368,11 @@ # skip draft posts from anything further
if post.is_future: logger.info("%s is for the future", post.name) continue - elif not os.path.exists(post.renderfile): - logger.debug( - "%s seems to be fist time published", post.name - ) - firsttimepublished.append(post) + # elif not os.path.exists(post.renderfile): + # logger.debug( + # "%s seems to be fist time published", post.name + # ) + # firsttimepublished.append(post) # add post to search database search.append(post)@@ -2330,7 +2419,7 @@ for category in categories.values():
home.add(category, category.get(category.sortedkeys[0])) queue.put(category.render()) - #queue.put(frontposts.render_feeds()) + # queue.put(frontposts.render_feeds()) queue.put(home.render()) # actually run all the render & copy tasks queue.run()@@ -2377,9 +2466,8 @@ queue.put(wm.send())
queue.run() logger.info("sending webmentions finished") - for post in firsttimepublished: - queue.put(post.save_memento()) - queue.put(post.save_to_archiveorg()) + # for post in firsttimepublished: + # queue.put(post.save_to_archiveorg()) queue.run()
@@ -205,6 +205,36 @@ "^/wp-content/.*$",
"^/broadcast/wp-ffpc\.message$", ] +formerdomains = [ + "cadeyrn.webporfolio.hu", + "blog.petermolnar.eu", + "petermolnar.eu", +] + +formercategories = { + "article": [ + "linux-tech-coding", + "diy-do-it-yourself", + "sysadmin-blog", + "sysadmin", + "szubjektiv-technika", + "wordpress" + ], + "note": [ + "blips", + "blog", + "r" + ], + "journal": [ + "blog", + ], + "photo": [ + "photoblog", + "fotography", + ] +} + + if os.path.isdir("/dev/shm") and os.access("/dev/shm", os.W_OK): tmpdir = "/dev/shm/nasg" else:
@@ -222,4 +222,9 @@ </symbol>
<symbol id="icon-www.flickr.com" viewBox="0 0 16 16"> <path fill="#0063dc" d="M0 8c0 2.049 1.663 3.709 3.71 3.709 2.050 0 3.713-1.66 3.713-3.709s-1.662-3.709-3.713-3.709c-2.047 0-3.71 1.66-3.71 3.709zM8.577 8c0 2.049 1.662 3.709 3.711 3.709 2.042 0 3.711-1.66 3.711-3.709s-1.661-3.709-3.709-3.709c-2.050 0-3.713 1.66-3.713 3.709z"></path> </symbol> + <symbol id="icon-web.archive.org" viewBox="0 0 17 16"> + <path d="M16 15v-1h-1v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-1v1h-1v1h17v-1h-1z"></path> + <path d="M8 0h1l8 5v1h-17v-1l8-5z"></path> + </symbol> + </svg>
@@ -13,21 +13,26 @@ from collections import deque
from urllib.parse import urlparse import settings import arrow +from time import sleep logger = logging.getLogger("wayback") logger.setLevel(10) console_handler = logging.StreamHandler() -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) console_handler.setFormatter(formatter) logger.addHandler(console_handler) from pprint import pprint -RE_FIRST = re.compile(r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$") +RE_FIRST = re.compile( + r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$" +) + class FindWaybackURL(object): - def __init__(self, path, category="", redirects=[]): self.path = path self.category = category@@ -49,39 +54,74 @@
for domain in domains: q[f"http://{domain}/{path}/"] = True if self.category in settings.formercategories: - categories = settings.formercategories[self.category] + categories = settings.formercategories[ + self.category + ] else: categories = [] categories.append(self.category) for category in categories: q[f"http://{domain}/{category}/{path}/"] = True - q[f"http://{domain}/category/{category}/{path}/"] = True - #logger.info("possible urls: %s", json.dumps(list(q.keys()), indent=4, ensure_ascii=False)) + q[ + f"http://{domain}/category/{category}/{path}/" + ] = True + # logger.info("possible urls: %s", json.dumps(list(q.keys()), indent=4, ensure_ascii=False)) return list(q.keys()) def get_first_memento(self, url): target = f"http://web.archive.org/web/timemap/link/{url}" + logger.info("requesting %s", url) mementos = requests.get(target) - if not mementos.text: - return None - for memento in mementos.text.split("\n"): - m = RE_FIRST.match(memento) - if m: - return settings.nameddict({ - 'epoch': int(arrow.get(m.group('datetime'), "ddd, DD MMM YYYY HH:mm:ss ZZZ").to("utc").timestamp), - 'url': m.group('url') - }) + if mementos.status_code == requests.codes.ok: + if not len(mementos.text): + logger.debug("empty memento response for %s", target) + for memento in mementos.text.split("\n"): + m = RE_FIRST.match(memento) + if m: + + r = settings.nameddict( + { + "epoch": int( + arrow.get( + m.group("datetime"), + "ddd, DD MMM YYYY HH:mm:ss ZZZ", + ) + .to("utc") + .timestamp + ), + "url": m.group("url"), + } + ) + logger.info("found memento candidate: %s", r) + return r + else: + logger.debug( + "no first memento found at: %s", target + ) + else: + logger.warning( + "request failed: %s, status: %s, txt: %s", + mementos, + mementos.status_code, + mementos.text, + ) def run(self): l = self.possible_urls() - logging.info("running archive.org lookup for %s", self.path) + logger.info("running archive.org lookup for %s", self.path) for url in l: maybe = self.get_first_memento(url) if maybe: if maybe.epoch < self.epoch: self.epoch = maybe.epoch self.oldest = maybe.url + sleep(.500) if not len(self.oldest): logger.error("no memento found for %s", self.path) else: - logger.info("\t\toldest found memento for %s: %s :: %s", self.path, str(arrow.get(self.epoch)), self.oldest) + logger.info( + "\t\toldest found memento for %s: %s :: %s", + self.path, + str(arrow.get(self.epoch)), + self.oldest, + )