cleanups on wayback functionality

Peter Molnar 2019-08-14 11:28:01 +01:00
parent 0fcbfe0bd9
commit 9e0e58a4c6
4 changed files with 253 additions and 90 deletions

nasg.py
View file

@@ -13,7 +13,8 @@ import re
import asyncio
import sqlite3
import json
#import base64
# import base64
from shutil import copy2 as cp
from urllib.parse import urlparse
from collections import namedtuple
@@ -38,6 +39,7 @@ from pandoc import PandocMD2HTML, PandocMD2TXT, PandocHTML2TXT
from meta import Exif
import settings
import keys
import wayback
logger = logging.getLogger("NASG")
@@ -140,7 +142,7 @@ def maybe_copy(source, target):
def extractdomain(url):
url = urlparse(url)
return url.netloc
return url.hostname
J2 = jinja2.Environment(
@@ -387,6 +389,7 @@ class Comment(MarkdownDoc):
}
return r
class WebImage(object):
def __init__(self, fpath, mdimg, parent):
logger.debug("loading image: %s", fpath)
@@ -603,9 +606,7 @@ class WebImage(object):
"Model": ["Model"],
"FNumber": ["FNumber", "Aperture"],
"ExposureTime": ["ExposureTime"],
"FocalLength": [
"FocalLength"
],
"FocalLength": ["FocalLength"],
"ISO": ["ISO"],
"LensID": ["LensID", "LensSpec", "Lens"],
"CreateDate": ["CreateDate", "DateTimeOriginal"],
@@ -681,14 +682,14 @@ class WebImage(object):
self.size = size
self.crop = crop
#@property
#def data(self):
#with open(self.fpath, "rb") as f:
#encoded = base64.b64encode(f.read())
#return "data:%s;base64,%s" % (
#self.parent.mime_type,
#encoded.decode("utf-8"),
#)
# @property
# def data(self):
# with open(self.fpath, "rb") as f:
# encoded = base64.b64encode(f.read())
# return "data:%s;base64,%s" % (
# self.parent.mime_type,
# encoded.decode("utf-8"),
# )
@property
def suffix(self):
@@ -806,6 +807,7 @@ class Singular(MarkdownDoc):
self.dirpath = os.path.dirname(fpath)
self.name = os.path.basename(self.dirpath)
self.category = os.path.basename(os.path.dirname(self.dirpath))
self.pointers = []
@cached_property
def files(self):
@@ -848,7 +850,9 @@
the Singular object
"""
images = {}
for match, alt, fname, title, css in RE_MDIMG.findall(self.content):
for match, alt, fname, title, css in RE_MDIMG.findall(
self.content
):
mdimg = MarkdownImage(match, alt, fname, title, css)
imgpath = os.path.join(self.dirpath, fname)
if imgpath in self.files:
@@ -1002,7 +1006,7 @@ class Singular(MarkdownDoc):
self.url,
url,
os.path.dirname(self.fpath),
self.dt.timestamp
self.dt.timestamp,
)
webmentions.append(w)
if self.is_reply:
@@ -1010,7 +1014,7 @@ class Singular(MarkdownDoc):
self.url,
self.is_reply,
os.path.dirname(self.fpath),
self.dt.timestamp
self.dt.timestamp,
)
webmentions.append(w)
return webmentions
@@ -1143,7 +1147,6 @@ class Singular(MarkdownDoc):
if self.event:
r.update({"subjectOf": self.event})
for url in list(set(self.to_syndicate)):
r["potentialAction"].append(
{
@@ -1199,7 +1202,9 @@ class Singular(MarkdownDoc):
@property
def corpus(self):
return "\n".join([self.title, self.name, self.summary, self.content])
return "\n".join(
[self.title, self.name, self.summary, self.content]
)
async def copy_files(self):
exclude = [
@@ -1231,12 +1236,35 @@ class Singular(MarkdownDoc):
logger.info("copying '%s' to '%s'", f, t)
cp(f, t)
async def save_to_archiveorg(self):
requests.get(f"http://web.archive.org/save/{self.url}")
async def get_from_archiveorg(self):
done = glob.glob(
os.path.join(self.dirpath, f"*archiveorg*.copy")
)
if done:
logger.debug(
"archive.org .copy exists for %s at %s",
self.name,
done[0],
)
return
logger.info("trying to get archive.org .copy for %s", self.name)
if len(self.category):
wb = wayback.FindWaybackURL(
self.name, self.category, self.pointers
)
wb.run()
if len(wb.oldest):
archiveurl = url2slug(wb.oldest)
t = os.path.join(self.dirpath, f"{archiveurl}.copy")
writepath(t, wb.oldest)
async def render(self):
if settings.args.get("memento"):
await self.get_from_archiveorg()
if self.exists:
return True
@@ -1247,7 +1275,7 @@ class Singular(MarkdownDoc):
"site": settings.site,
"menu": settings.menu,
"meta": settings.meta,
"fnames": settings.filenames
"fnames": settings.filenames,
}
writepath(
self.renderfile, J2.get_template(self.template).render(v)
@@ -1260,8 +1288,7 @@ class Singular(MarkdownDoc):
"content": self.txt_content,
}
writepath(
self.txtfile,
J2.get_template(self.txttemplate).render(g),
self.txtfile, J2.get_template(self.txttemplate).render(g)
)
del g
@@ -1300,12 +1327,7 @@ class Home(Singular):
return arrow.get(ts)
async def render_gopher(self):
lines = [
"%s's gopherhole"
% (settings.site.name),
"",
"",
]
lines = ["%s's gopherhole" % (settings.site.name), "", ""]
for category, post in self.posts:
line = "1%s\t/%s/%s\t%s\t70" % (
@@ -1585,7 +1607,7 @@ class Category(dict):
def renderdir(self):
b = settings.paths.build
if len(self.name):
b = os.path.join(b,settings.paths.category, self.name)
b = os.path.join(b, settings.paths.category, self.name)
return b
@property
@@ -1623,11 +1645,19 @@ class Category(dict):
@property
def mtime(self):
return max(list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination])
return max(
list(sorted(self.parent.keys(), reverse=True))[
0 : settings.pagination
]
)
@property
def renderfile(self):
return os.path.join(self.parent.renderdir, settings.paths.feed, settings.filenames.json)
return os.path.join(
self.parent.renderdir,
settings.paths.feed,
settings.filenames.json,
)
@property
def exists(self):
@@ -1641,10 +1671,14 @@ class Category(dict):
async def render(self):
if self.exists:
logger.debug("category %s is up to date", self.parent.name)
logger.debug(
"category %s is up to date", self.parent.name
)
return
logger.info("rendering JSON feed for category %s", self.parent.name)
logger.info(
"rendering JSON feed for category %s", self.parent.name
)
js = {
"version": "https://jsonfeed.org/version/1",
@@ -1659,7 +1693,9 @@ class Category(dict):
"items": [],
}
for key in list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination]:
for key in list(sorted(self.parent.keys(), reverse=True))[
0 : settings.pagination
]:
post = self.parent[key]
pjs = {
"id": post.url,
@@ -1676,12 +1712,15 @@ class Category(dict):
"attachment": {
"url": post.photo.href,
"mime_type": post.photo.mime_type,
"size_in_bytes": f"{post.photo.mime_size}"
"size_in_bytes": f"{post.photo.mime_size}",
}
}
)
js["items"].append(pjs)
writepath(self.renderfile,json.dumps(js, indent=4, ensure_ascii=False))
writepath(
self.renderfile,
json.dumps(js, indent=4, ensure_ascii=False),
)
class XMLFeed(object):
def __init__(self, parent, feedformat="rss"):
@@ -1690,7 +1729,11 @@ class Category(dict):
@property
def mtime(self):
return max(list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination])
return max(
list(sorted(self.parent.keys(), reverse=True))[
0 : settings.pagination
]
)
@property
def renderfile(self):
@@ -1700,7 +1743,9 @@ class Category(dict):
fname = settings.filenames.atom
else:
fname = "index.xml"
return os.path.join(self.parent.renderdir, settings.paths.feed, fname)
return os.path.join(
self.parent.renderdir, settings.paths.feed, fname
)
@property
def exists(self):
@@ -1714,10 +1759,16 @@ class Category(dict):
async def render(self):
if self.exists:
logger.debug("category %s is up to date", self.parent.name)
logger.debug(
"category %s is up to date", self.parent.name
)
return
logger.info("rendering %s feed for category %s", self.feedformat, self.parent.name)
logger.info(
"rendering %s feed for category %s",
self.feedformat,
self.parent.name,
)
fg = FeedGenerator()
fg.id(self.parent.feedurl)
@@ -1732,12 +1783,14 @@ class Category(dict):
}
)
if self.feedformat == "rss":
fg.link(href=self.feedurl)
fg.link(href=self.parent.feedurl)
elif self.feedformat == "atom":
fg.link(href=self.feedurl, rel="self")
fg.link(href=self.parent.feedurl, rel="self")
fg.link(href=settings.meta.get("hub"), rel="hub")
for key in list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination]:
for key in list(sorted(self.parent.keys(), reverse=True))[
0 : settings.pagination
]:
post = self.parent[key]
fe = fg.add_entry()
@@ -1787,7 +1840,6 @@ class Category(dict):
writepath(self.renderfile, fg.atom_str(pretty=True))
class Year(object):
def __init__(self, parent, year):
self.parent = parent
@@ -1814,9 +1866,15 @@ class Category(dict):
@property
def renderfile(self):
if self.year == self.parent.newest_year:
return os.path.join(self.parent.renderdir, settings.filenames.html)
return os.path.join(
self.parent.renderdir, settings.filenames.html
)
else:
return os.path.join(self.parent.renderdir, self.year, settings.filenames.html)
return os.path.join(
self.parent.renderdir,
self.year,
settings.filenames.html,
)
@property
def baseurl(self):
@@ -1854,19 +1912,25 @@ class Category(dict):
"title": self.parent.title,
"paginated": True,
"years": self.parent.years,
"year": self.year
"year": self.year,
},
"posts": self.posttmplvars
"posts": self.posttmplvars,
}
async def render(self):
if self.exists:
logger.debug("category %s is up to date", self.parent.name)
logger.debug(
"category %s is up to date", self.parent.name
)
return
logger.info("rendering year %s for category %s", self.year, self.parent.name)
logger.info(
"rendering year %s for category %s",
self.year,
self.parent.name,
)
r = J2.get_template(self.template).render(self.tmplvars)
writepath(self.renderfile, r)
del(r)
del r
class Flat(object):
def __init__(self, parent):
@@ -1876,7 +1940,9 @@ class Category(dict):
def posttmplvars(self):
return [
self.parent[key].jsonld
for key in list(sorted(self.parent.keys(), reverse=True))
for key in list(
sorted(self.parent.keys(), reverse=True)
)
]
@property
@@ -1885,7 +1951,9 @@ class Category(dict):
@property
def renderfile(self):
return os.path.join(self.parent.renderdir, settings.filenames.html)
return os.path.join(
self.parent.renderdir, settings.filenames.html
)
@property
def template(self):
@@ -1915,17 +1983,19 @@ class Category(dict):
"feed": self.parent.feedurl,
"title": self.parent.title,
},
"posts": self.posttmplvars
"posts": self.posttmplvars,
}
async def render(self):
if self.exists:
logger.debug("category %s is up to date", self.parent.name)
logger.debug(
"category %s is up to date", self.parent.name
)
return
logger.info("rendering category %s", self.parent.name)
r = J2.get_template(self.template).render(self.tmplvars)
writepath(self.renderfile, r)
del(r)
del r
class Gopher(object):
def __init__(self, parent):
@@ -1947,17 +2017,27 @@ class Category(dict):
@property
def renderfile(self):
return os.path.join(self.parent.renderdir, settings.filenames.gopher)
return os.path.join(
self.parent.renderdir, settings.filenames.gopher
)
async def render(self):
if self.exists:
logger.debug("category %s is up to date", self.parent.name)
logger.debug(
"category %s is up to date", self.parent.name
)
return
lines = ["%s - %s" % (self.parent.name, settings.site.name), "", ""]
lines = [
"%s - %s" % (self.parent.name, settings.site.name),
"",
"",
]
for post in [
self.parent[key]
for key in list(sorted(self.parent.keys(), reverse=True))
for key in list(
sorted(self.parent.keys(), reverse=True)
)
]:
line = "0%s\t/%s/%s\t%s\t70" % (
post.title,
@@ -1979,6 +2059,7 @@ class Category(dict):
lines.append("")
writepath(self.renderfile, "\r\n".join(lines))
class Sitemap(dict):
@property
def mtime(self):
@@ -2058,7 +2139,6 @@ class Webmention(object):
else:
self.save(r.text)
def backfill_syndication(self):
""" this is very specific to webmention.io and brid.gy publish """
@@ -2105,9 +2185,7 @@ class Webmention(object):
if "url" in maybe["http_body"]:
data = json.loads(maybe["http_body"])
url = data["url"]
sp = os.path.join(
self.dpath, "%s.copy" % url2slug(url)
)
sp = os.path.join(self.dpath, "%s.copy" % url2slug(url))
if os.path.exists(sp):
return
with open(sp, "wt") as f:
@@ -2123,6 +2201,7 @@ class Webmention(object):
)
pass
class WebmentionIO(object):
def __init__(self):
self.params = {
@@ -2258,12 +2337,22 @@ def make():
frontposts = Category()
home = Home(settings.paths.get("home"))
reverse_redirects = {}
for e in glob.glob(os.path.join(content, "*", "*.url")):
post = Redirect(e)
rules.add_redirect(post.source, post.target)
if post.target not in reverse_redirects:
reverse_redirects[post.target] = []
reverse_redirects[post.target].append(post.source)
for e in sorted(
glob.glob(
os.path.join(content, "*", "*", settings.filenames.md)
)
):
post = Singular(e)
if post.url in reverse_redirects:
post.pointers = reverse_redirects[post.target]
# deal with images, if needed
for i in post.images.values():
queue.put(i.downsize())
@@ -2279,11 +2368,11 @@ def make():
if post.is_future:
logger.info("%s is for the future", post.name)
continue
elif not os.path.exists(post.renderfile):
logger.debug(
"%s seems to be fist time published", post.name
)
firsttimepublished.append(post)
# elif not os.path.exists(post.renderfile):
# logger.debug(
# "%s seems to be fist time published", post.name
# )
# firsttimepublished.append(post)
# add post to search database
search.append(post)
@@ -2330,7 +2419,7 @@ def make():
home.add(category, category.get(category.sortedkeys[0]))
queue.put(category.render())
#queue.put(frontposts.render_feeds())
# queue.put(frontposts.render_feeds())
queue.put(home.render())
# actually run all the render & copy tasks
queue.run()
@@ -2377,9 +2466,8 @@ def make():
queue.run()
logger.info("sending webmentions finished")
for post in firsttimepublished:
queue.put(post.save_memento())
queue.put(post.save_to_archiveorg())
# for post in firsttimepublished:
# queue.put(post.save_to_archiveorg())
queue.run()

View file

@@ -205,6 +205,36 @@ gones = [
"^/broadcast/wp-ffpc\.message$",
]
formerdomains = [
"cadeyrn.webporfolio.hu",
"blog.petermolnar.eu",
"petermolnar.eu",
]
formercategories = {
"article": [
"linux-tech-coding",
"diy-do-it-yourself",
"sysadmin-blog",
"sysadmin",
"szubjektiv-technika",
"wordpress"
],
"note": [
"blips",
"blog",
"r"
],
"journal": [
"blog",
],
"photo": [
"photoblog",
"fotography",
]
}
if os.path.isdir("/dev/shm") and os.access("/dev/shm", os.W_OK):
tmpdir = "/dev/shm/nasg"
else:

View file

@@ -222,4 +222,9 @@
<symbol id="icon-www.flickr.com" viewBox="0 0 16 16">
<path fill="#0063dc" d="M0 8c0 2.049 1.663 3.709 3.71 3.709 2.050 0 3.713-1.66 3.713-3.709s-1.662-3.709-3.713-3.709c-2.047 0-3.71 1.66-3.71 3.709zM8.577 8c0 2.049 1.662 3.709 3.711 3.709 2.042 0 3.711-1.66 3.711-3.709s-1.661-3.709-3.709-3.709c-2.050 0-3.713 1.66-3.713 3.709z"></path>
</symbol>
<symbol id="icon-web.archive.org" viewBox="0 0 17 16">
<path d="M16 15v-1h-1v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-1v1h-1v1h17v-1h-1z"></path>
<path d="M8 0h1l8 5v1h-17v-1l8-5z"></path>
</symbol>
</svg>


View file

@@ -13,21 +13,26 @@ from collections import deque
from urllib.parse import urlparse
import settings
import arrow
from time import sleep
logger = logging.getLogger("wayback")
logger.setLevel(10)
console_handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
from pprint import pprint
RE_FIRST = re.compile(r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$")
RE_FIRST = re.compile(
r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$"
)
class FindWaybackURL(object):
def __init__(self, path, category="", redirects=[]):
self.path = path
self.category = category
@@ -49,39 +54,74 @@ class FindWaybackURL(object):
for domain in domains:
q[f"http://{domain}/{path}/"] = True
if self.category in settings.formercategories:
categories = settings.formercategories[self.category]
categories = settings.formercategories[
self.category
]
else:
categories = []
categories.append(self.category)
for category in categories:
q[f"http://{domain}/{category}/{path}/"] = True
q[f"http://{domain}/category/{category}/{path}/"] = True
#logger.info("possible urls: %s", json.dumps(list(q.keys()), indent=4, ensure_ascii=False))
q[
f"http://{domain}/category/{category}/{path}/"
] = True
# logger.info("possible urls: %s", json.dumps(list(q.keys()), indent=4, ensure_ascii=False))
return list(q.keys())
def get_first_memento(self, url):
target = f"http://web.archive.org/web/timemap/link/{url}"
logger.info("requesting %s", url)
mementos = requests.get(target)
if not mementos.text:
return None
for memento in mementos.text.split("\n"):
m = RE_FIRST.match(memento)
if m:
return settings.nameddict({
'epoch': int(arrow.get(m.group('datetime'), "ddd, DD MMM YYYY HH:mm:ss ZZZ").to("utc").timestamp),
'url': m.group('url')
})
if mementos.status_code == requests.codes.ok:
if not len(mementos.text):
logger.debug("empty memento response for %s", target)
for memento in mementos.text.split("\n"):
m = RE_FIRST.match(memento)
if m:
r = settings.nameddict(
{
"epoch": int(
arrow.get(
m.group("datetime"),
"ddd, DD MMM YYYY HH:mm:ss ZZZ",
)
.to("utc")
.timestamp
),
"url": m.group("url"),
}
)
logger.info("found memento candidate: %s", r)
return r
else:
logger.debug(
"no first memento found at: %s", target
)
else:
logger.warning(
"request failed: %s, status: %s, txt: %s",
mementos,
mementos.status_code,
mementos.text,
)
def run(self):
l = self.possible_urls()
logging.info("running archive.org lookup for %s", self.path)
logger.info("running archive.org lookup for %s", self.path)
for url in l:
maybe = self.get_first_memento(url)
if maybe:
if maybe.epoch < self.epoch:
self.epoch = maybe.epoch
self.oldest = maybe.url
sleep(.500)
if not len(self.oldest):
logger.error("no memento found for %s", self.path)
else:
logger.info("\t\toldest found memento for %s: %s :: %s", self.path, str(arrow.get(self.epoch)), self.oldest)
logger.info(
"\t\toldest found memento for %s: %s :: %s",
self.path,
str(arrow.get(self.epoch)),
self.oldest,
)