cleanups on wayback functionality

parent 0fcbfe0bd9
commit 9e0e58a4c6

4 changed files with 253 additions and 90 deletions
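In short: the build can now backfill the oldest archive.org memento for each post. nasg.py gains a Singular.get_from_archiveorg() step, gated by settings.args.get("memento") and a per-post *.copy marker file; posts collect their former URLs from *.url redirect files into the new pointers list; settings.py records the site's former domains and categories so the wayback lookup can try old URL shapes; and wayback.py gets HTTP status checking, proper logging, and a polite sleep between requests. The rest is black-style reformatting plus two small fixes: extractdomain() now returns url.hostname instead of url.netloc, and the XML feeds link self.parent.feedurl rather than self.feedurl.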
nasg.py (234 changed lines)

@@ -13,7 +13,8 @@ import re
 import asyncio
 import sqlite3
 import json
-#import base64
+
+# import base64
 from shutil import copy2 as cp
 from urllib.parse import urlparse
 from collections import namedtuple

@@ -38,6 +39,7 @@ from pandoc import PandocMD2HTML, PandocMD2TXT, PandocHTML2TXT
 from meta import Exif
 import settings
 import keys
+import wayback

 logger = logging.getLogger("NASG")

@@ -140,7 +142,7 @@ def maybe_copy(source, target):

 def extractdomain(url):
     url = urlparse(url)
-    return url.netloc
+    return url.hostname


 J2 = jinja2.Environment(

@@ -387,6 +389,7 @@ class Comment(MarkdownDoc):
         }
         return r

+
 class WebImage(object):
     def __init__(self, fpath, mdimg, parent):
         logger.debug("loading image: %s", fpath)

@@ -603,9 +606,7 @@ class WebImage(object):
             "Model": ["Model"],
             "FNumber": ["FNumber", "Aperture"],
             "ExposureTime": ["ExposureTime"],
-            "FocalLength": [
-                "FocalLength"
-            ],
+            "FocalLength": ["FocalLength"],
             "ISO": ["ISO"],
             "LensID": ["LensID", "LensSpec", "Lens"],
             "CreateDate": ["CreateDate", "DateTimeOriginal"],

@@ -681,14 +682,14 @@ class WebImage(object):
             self.size = size
             self.crop = crop

-            #@property
-            #def data(self):
-            #with open(self.fpath, "rb") as f:
-            #encoded = base64.b64encode(f.read())
-            #return "data:%s;base64,%s" % (
-            #self.parent.mime_type,
-            #encoded.decode("utf-8"),
-            #)
+            # @property
+            # def data(self):
+            # with open(self.fpath, "rb") as f:
+            # encoded = base64.b64encode(f.read())
+            # return "data:%s;base64,%s" % (
+            # self.parent.mime_type,
+            # encoded.decode("utf-8"),
+            # )

         @property
         def suffix(self):

@@ -806,6 +807,7 @@ class Singular(MarkdownDoc):
         self.dirpath = os.path.dirname(fpath)
         self.name = os.path.basename(self.dirpath)
         self.category = os.path.basename(os.path.dirname(self.dirpath))
+        self.pointers = []

     @cached_property
     def files(self):

@@ -848,7 +850,9 @@ class Singular(MarkdownDoc):
        the Singular object
        """
        images = {}
-        for match, alt, fname, title, css in RE_MDIMG.findall(self.content):
+        for match, alt, fname, title, css in RE_MDIMG.findall(
+            self.content
+        ):
            mdimg = MarkdownImage(match, alt, fname, title, css)
            imgpath = os.path.join(self.dirpath, fname)
            if imgpath in self.files:

@@ -1002,7 +1006,7 @@ class Singular(MarkdownDoc):
                 self.url,
                 url,
                 os.path.dirname(self.fpath),
-                self.dt.timestamp
+                self.dt.timestamp,
             )
             webmentions.append(w)
         if self.is_reply:

@@ -1010,7 +1014,7 @@ class Singular(MarkdownDoc):
                 self.url,
                 self.is_reply,
                 os.path.dirname(self.fpath),
-                self.dt.timestamp
+                self.dt.timestamp,
             )
             webmentions.append(w)
         return webmentions

@@ -1143,7 +1147,6 @@ class Singular(MarkdownDoc):
         if self.event:
             r.update({"subjectOf": self.event})

-
         for url in list(set(self.to_syndicate)):
             r["potentialAction"].append(
                 {

@@ -1199,7 +1202,9 @@ class Singular(MarkdownDoc):

     @property
     def corpus(self):
-        return "\n".join([self.title, self.name, self.summary, self.content])
+        return "\n".join(
+            [self.title, self.name, self.summary, self.content]
+        )

     async def copy_files(self):
         exclude = [

@@ -1231,12 +1236,35 @@ class Singular(MarkdownDoc):
                 logger.info("copying '%s' to '%s'", f, t)
                 cp(f, t)

     async def save_to_archiveorg(self):
         requests.get(f"http://web.archive.org/save/{self.url}")

+    async def get_from_archiveorg(self):
+        done = glob.glob(
+            os.path.join(self.dirpath, f"*archiveorg*.copy")
+        )
+        if done:
+            logger.debug(
+                "archive.org .copy exists for %s at %s",
+                self.name,
+                done[0],
+            )
+            return
+        logger.info("trying to get archive.org .copy for %s", self.name)
+        if len(self.category):
+            wb = wayback.FindWaybackURL(
+                self.name, self.category, self.pointers
+            )
+            wb.run()
+            if len(wb.oldest):
+                archiveurl = url2slug(wb.oldest)
+                t = os.path.join(self.dirpath, f"{archiveurl}.copy")
+                writepath(t, wb.oldest)
+
     async def render(self):
+        if settings.args.get("memento"):
+            await self.get_from_archiveorg()
+
         if self.exists:
             return True

@@ -1247,7 +1275,7 @@ class Singular(MarkdownDoc):
             "site": settings.site,
             "menu": settings.menu,
             "meta": settings.meta,
-            "fnames": settings.filenames
+            "fnames": settings.filenames,
         }
         writepath(
             self.renderfile, J2.get_template(self.template).render(v)

@@ -1260,8 +1288,7 @@ class Singular(MarkdownDoc):
             "content": self.txt_content,
         }
         writepath(
-            self.txtfile,
-            J2.get_template(self.txttemplate).render(g),
+            self.txtfile, J2.get_template(self.txttemplate).render(g)
         )
         del g

@@ -1300,12 +1327,7 @@ class Home(Singular):
         return arrow.get(ts)

     async def render_gopher(self):
-        lines = [
-            "%s's gopherhole"
-            % (settings.site.name),
-            "",
-            "",
-        ]
+        lines = ["%s's gopherhole" % (settings.site.name), "", ""]

         for category, post in self.posts:
             line = "1%s\t/%s/%s\t%s\t70" % (

@@ -1585,7 +1607,7 @@ class Category(dict):
     def renderdir(self):
         b = settings.paths.build
         if len(self.name):
-            b = os.path.join(b,settings.paths.category, self.name)
+            b = os.path.join(b, settings.paths.category, self.name)
         return b

     @property

@@ -1623,11 +1645,19 @@ class Category(dict):

        @property
        def mtime(self):
-            return max(list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination])
+            return max(
+                list(sorted(self.parent.keys(), reverse=True))[
+                    0 : settings.pagination
+                ]
+            )

        @property
        def renderfile(self):
-            return os.path.join(self.parent.renderdir, settings.paths.feed, settings.filenames.json)
+            return os.path.join(
+                self.parent.renderdir,
+                settings.paths.feed,
+                settings.filenames.json,
+            )

        @property
        def exists(self):

@@ -1641,10 +1671,14 @@ class Category(dict):

        async def render(self):
            if self.exists:
-                logger.debug("category %s is up to date", self.parent.name)
+                logger.debug(
+                    "category %s is up to date", self.parent.name
+                )
                return

-            logger.info("rendering JSON feed for category %s", self.parent.name)
+            logger.info(
+                "rendering JSON feed for category %s", self.parent.name
+            )

            js = {
                "version": "https://jsonfeed.org/version/1",

@@ -1659,7 +1693,9 @@ class Category(dict):
                "items": [],
            }

-            for key in list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination]:
+            for key in list(sorted(self.parent.keys(), reverse=True))[
+                0 : settings.pagination
+            ]:
                post = self.parent[key]
                pjs = {
                    "id": post.url,

@@ -1676,12 +1712,15 @@ class Category(dict):
                            "attachment": {
                                "url": post.photo.href,
                                "mime_type": post.photo.mime_type,
-                                "size_in_bytes": f"{post.photo.mime_size}"
+                                "size_in_bytes": f"{post.photo.mime_size}",
                            }
                        }
                    )
                js["items"].append(pjs)
-            writepath(self.renderfile,json.dumps(js, indent=4, ensure_ascii=False))
+            writepath(
+                self.renderfile,
+                json.dumps(js, indent=4, ensure_ascii=False),
+            )

    class XMLFeed(object):
        def __init__(self, parent, feedformat="rss"):

@@ -1690,7 +1729,11 @@ class Category(dict):

        @property
        def mtime(self):
-            return max(list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination])
+            return max(
+                list(sorted(self.parent.keys(), reverse=True))[
+                    0 : settings.pagination
+                ]
+            )

        @property
        def renderfile(self):

@@ -1700,7 +1743,9 @@ class Category(dict):
                fname = settings.filenames.atom
            else:
                fname = "index.xml"
-            return os.path.join(self.parent.renderdir, settings.paths.feed, fname)
+            return os.path.join(
+                self.parent.renderdir, settings.paths.feed, fname
+            )

        @property
        def exists(self):

@@ -1714,10 +1759,16 @@ class Category(dict):

        async def render(self):
            if self.exists:
-                logger.debug("category %s is up to date", self.parent.name)
+                logger.debug(
+                    "category %s is up to date", self.parent.name
+                )
                return

-            logger.info("rendering %s feed for category %s", self.feedformat, self.parent.name)
+            logger.info(
+                "rendering %s feed for category %s",
+                self.feedformat,
+                self.parent.name,
+            )

            fg = FeedGenerator()
            fg.id(self.parent.feedurl)

@@ -1732,12 +1783,14 @@ class Category(dict):
                }
            )
            if self.feedformat == "rss":
-                fg.link(href=self.feedurl)
+                fg.link(href=self.parent.feedurl)
            elif self.feedformat == "atom":
-                fg.link(href=self.feedurl, rel="self")
+                fg.link(href=self.parent.feedurl, rel="self")
                fg.link(href=settings.meta.get("hub"), rel="hub")

-            for key in list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination]:
+            for key in list(sorted(self.parent.keys(), reverse=True))[
+                0 : settings.pagination
+            ]:
                post = self.parent[key]
                fe = fg.add_entry()

@@ -1787,7 +1840,6 @@ class Category(dict):

            writepath(self.renderfile, fg.atom_str(pretty=True))

-
    class Year(object):
        def __init__(self, parent, year):
            self.parent = parent

@@ -1814,9 +1866,15 @@ class Category(dict):
        @property
        def renderfile(self):
            if self.year == self.parent.newest_year:
-                return os.path.join(self.parent.renderdir, settings.filenames.html)
+                return os.path.join(
+                    self.parent.renderdir, settings.filenames.html
+                )
            else:
-                return os.path.join(self.parent.renderdir, self.year, settings.filenames.html)
+                return os.path.join(
+                    self.parent.renderdir,
+                    self.year,
+                    settings.filenames.html,
+                )

        @property
        def baseurl(self):

@@ -1854,19 +1912,25 @@ class Category(dict):
                    "title": self.parent.title,
                    "paginated": True,
                    "years": self.parent.years,
-                    "year": self.year
+                    "year": self.year,
                },
-                "posts": self.posttmplvars
+                "posts": self.posttmplvars,
            }

        async def render(self):
            if self.exists:
-                logger.debug("category %s is up to date", self.parent.name)
+                logger.debug(
+                    "category %s is up to date", self.parent.name
+                )
                return
-            logger.info("rendering year %s for category %s", self.year, self.parent.name)
+            logger.info(
+                "rendering year %s for category %s",
+                self.year,
+                self.parent.name,
+            )
            r = J2.get_template(self.template).render(self.tmplvars)
            writepath(self.renderfile, r)
-            del(r)
+            del r

    class Flat(object):
        def __init__(self, parent):

@@ -1876,7 +1940,9 @@ class Category(dict):
        def posttmplvars(self):
            return [
                self.parent[key].jsonld
-                for key in list(sorted(self.parent.keys(), reverse=True))
+                for key in list(
+                    sorted(self.parent.keys(), reverse=True)
+                )
            ]

        @property

@@ -1885,7 +1951,9 @@ class Category(dict):

        @property
        def renderfile(self):
-            return os.path.join(self.parent.renderdir, settings.filenames.html)
+            return os.path.join(
+                self.parent.renderdir, settings.filenames.html
+            )

        @property
        def template(self):

@@ -1915,17 +1983,19 @@ class Category(dict):
                    "feed": self.parent.feedurl,
                    "title": self.parent.title,
                },
-                "posts": self.posttmplvars
+                "posts": self.posttmplvars,
            }

        async def render(self):
            if self.exists:
-                logger.debug("category %s is up to date", self.parent.name)
+                logger.debug(
+                    "category %s is up to date", self.parent.name
+                )
                return
            logger.info("rendering category %s", self.parent.name)
            r = J2.get_template(self.template).render(self.tmplvars)
            writepath(self.renderfile, r)
-            del(r)
+            del r

    class Gopher(object):
        def __init__(self, parent):

@@ -1947,17 +2017,27 @@ class Category(dict):

        @property
        def renderfile(self):
-            return os.path.join(self.parent.renderdir, settings.filenames.gopher)
+            return os.path.join(
+                self.parent.renderdir, settings.filenames.gopher
+            )

        async def render(self):
            if self.exists:
-                logger.debug("category %s is up to date", self.parent.name)
+                logger.debug(
+                    "category %s is up to date", self.parent.name
+                )
                return

-            lines = ["%s - %s" % (self.parent.name, settings.site.name), "", ""]
+            lines = [
+                "%s - %s" % (self.parent.name, settings.site.name),
+                "",
+                "",
+            ]
            for post in [
                self.parent[key]
-                for key in list(sorted(self.parent.keys(), reverse=True))
+                for key in list(
+                    sorted(self.parent.keys(), reverse=True)
+                )
            ]:
                line = "0%s\t/%s/%s\t%s\t70" % (
                    post.title,

@@ -1979,6 +2059,7 @@ class Category(dict):
                lines.append("")
            writepath(self.renderfile, "\r\n".join(lines))

+
 class Sitemap(dict):
     @property
     def mtime(self):

@@ -2058,7 +2139,6 @@ class Webmention(object):
         else:
             self.save(r.text)

-
     def backfill_syndication(self):
         """ this is very specific to webmention.io and brid.gy publish """

@@ -2105,9 +2185,7 @@ class Webmention(object):
         if "url" in maybe["http_body"]:
             data = json.loads(maybe["http_body"])
             url = data["url"]
-            sp = os.path.join(
-                self.dpath, "%s.copy" % url2slug(url)
-            )
+            sp = os.path.join(self.dpath, "%s.copy" % url2slug(url))
             if os.path.exists(sp):
                 return
             with open(sp, "wt") as f:

@@ -2123,6 +2201,7 @@ class Webmention(object):
             )
             pass

+
 class WebmentionIO(object):
     def __init__(self):
         self.params = {

@@ -2258,12 +2337,22 @@ def make():
     frontposts = Category()
     home = Home(settings.paths.get("home"))

+    reverse_redirects = {}
+    for e in glob.glob(os.path.join(content, "*", "*.url")):
+        post = Redirect(e)
+        rules.add_redirect(post.source, post.target)
+        if post.target not in reverse_redirects:
+            reverse_redirects[post.target] = []
+        reverse_redirects[post.target].append(post.source)
+
     for e in sorted(
         glob.glob(
             os.path.join(content, "*", "*", settings.filenames.md)
         )
     ):
         post = Singular(e)
+        if post.url in reverse_redirects:
+            post.pointers = reverse_redirects[post.target]
         # deal with images, if needed
         for i in post.images.values():
             queue.put(i.downsize())

@@ -2279,11 +2368,11 @@ def make():
         if post.is_future:
             logger.info("%s is for the future", post.name)
             continue
-        elif not os.path.exists(post.renderfile):
-            logger.debug(
-                "%s seems to be fist time published", post.name
-            )
-            firsttimepublished.append(post)
+        # elif not os.path.exists(post.renderfile):
+        #     logger.debug(
+        #         "%s seems to be fist time published", post.name
+        #     )
+        #     firsttimepublished.append(post)

         # add post to search database
         search.append(post)

@@ -2330,7 +2419,7 @@ def make():
         home.add(category, category.get(category.sortedkeys[0]))
         queue.put(category.render())

-    #queue.put(frontposts.render_feeds())
+    # queue.put(frontposts.render_feeds())
     queue.put(home.render())
     # actually run all the render & copy tasks
     queue.run()

@@ -2377,9 +2466,8 @@ def make():
     queue.run()
     logger.info("sending webmentions finished")

-    for post in firsttimepublished:
-        queue.put(post.save_memento())
-        queue.put(post.save_to_archiveorg())
+    # for post in firsttimepublished:
+    #     queue.put(post.save_to_archiveorg())
     queue.run()
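The heart of the nasg.py change is the marker-file gating in get_from_archiveorg(): the wayback lookup only runs while no *archiveorg*.copy file exists in the post directory, and a successful lookup writes the memento URL into exactly such a file, so the next run short-circuits. A condensed, self-contained sketch of that pattern (the fixed marker name and the injected lookup callable are illustrative; nasg derives the real filename from the memento URL with url2slug()):

import glob
import os


def get_memento_once(dirpath, lookup):
    """Run the wayback lookup at most once per post directory."""
    if glob.glob(os.path.join(dirpath, "*archiveorg*.copy")):
        return  # resolved on an earlier run; skip the network round-trip
    oldest = lookup()  # in nasg: wayback.FindWaybackURL(...), run(), read .oldest
    if oldest:
        marker = os.path.join(dirpath, "archiveorg.copy")
        with open(marker, "wt") as f:
            f.write(oldest)

Since the marker also caches the memento URL itself, deleting the .copy file is all it takes to force a fresh lookup.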
settings.py (30 changed lines)

@@ -205,6 +205,36 @@ gones = [
     "^/broadcast/wp-ffpc\.message$",
 ]

+formerdomains = [
+    "cadeyrn.webporfolio.hu",
+    "blog.petermolnar.eu",
+    "petermolnar.eu",
+]
+
+formercategories = {
+    "article": [
+        "linux-tech-coding",
+        "diy-do-it-yourself",
+        "sysadmin-blog",
+        "sysadmin",
+        "szubjektiv-technika",
+        "wordpress"
+    ],
+    "note": [
+        "blips",
+        "blog",
+        "r"
+    ],
+    "journal": [
+        "blog",
+    ],
+    "photo": [
+        "photoblog",
+        "fotography",
+    ]
+}
+
+
 if os.path.isdir("/dev/shm") and os.access("/dev/shm", os.W_OK):
     tmpdir = "/dev/shm/nasg"
 else:
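The two new settings blocks exist to feed wayback.FindWaybackURL.possible_urls(): each post slug is tried on every former domain, bare and under each former (plus current) category, in both the /{category}/ and /category/{category}/ URL shapes, the latter presumably left over from WordPress permalinks. A small illustration of that expansion (the exact domain wiring and ordering are simplified here):

formerdomains = ["cadeyrn.webporfolio.hu", "petermolnar.eu"]
formercategories = {"article": ["sysadmin", "wordpress"]}


def candidates(path, category):
    # mirrors the loop in FindWaybackURL.possible_urls(); the dict keeps
    # insertion order while deduplicating repeated URLs
    q = {}
    for domain in formerdomains:
        q[f"http://{domain}/{path}/"] = True
        for cat in formercategories.get(category, []) + [category]:
            q[f"http://{domain}/{cat}/{path}/"] = True
            q[f"http://{domain}/category/{cat}/{path}/"] = True
    return list(q.keys())


# candidates("some-post", "article") yields 14 URLs, e.g.
#   http://cadeyrn.webporfolio.hu/some-post/
#   http://cadeyrn.webporfolio.hu/sysadmin/some-post/
#   http://cadeyrn.webporfolio.hu/category/sysadmin/some-post/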
(SVG symbol sprite; filename not preserved in this view)

@@ -222,4 +222,9 @@
     <symbol id="icon-www.flickr.com" viewBox="0 0 16 16">
         <path fill="#0063dc" d="M0 8c0 2.049 1.663 3.709 3.71 3.709 2.050 0 3.713-1.66 3.713-3.709s-1.662-3.709-3.713-3.709c-2.047 0-3.71 1.66-3.71 3.709zM8.577 8c0 2.049 1.662 3.709 3.711 3.709 2.042 0 3.711-1.66 3.711-3.709s-1.661-3.709-3.709-3.709c-2.050 0-3.713 1.66-3.713 3.709z"></path>
     </symbol>
+    <symbol id="icon-web.archive.org" viewBox="0 0 17 16">
+        <path d="M16 15v-1h-1v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-1v1h-1v1h17v-1h-1z"></path>
+        <path d="M8 0h1l8 5v1h-17v-1l8-5z"></path>
+    </symbol>
+
 </svg>
wayback.py (68 changed lines)

@@ -13,21 +13,26 @@ from collections import deque
 from urllib.parse import urlparse
 import settings
 import arrow
+from time import sleep

 logger = logging.getLogger("wayback")
 logger.setLevel(10)

 console_handler = logging.StreamHandler()
-formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+formatter = logging.Formatter(
+    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
 console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)

 from pprint import pprint

-RE_FIRST = re.compile(r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$")
+RE_FIRST = re.compile(
+    r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$"
+)


 class FindWaybackURL(object):
     def __init__(self, path, category="", redirects=[]):
         self.path = path
         self.category = category

@@ -49,39 +54,74 @@ class FindWaybackURL(object):
         for domain in domains:
             q[f"http://{domain}/{path}/"] = True
             if self.category in settings.formercategories:
-                categories = settings.formercategories[self.category]
+                categories = settings.formercategories[
+                    self.category
+                ]
             else:
                 categories = []
             categories.append(self.category)
             for category in categories:
                 q[f"http://{domain}/{category}/{path}/"] = True
-                q[f"http://{domain}/category/{category}/{path}/"] = True
-        #logger.info("possible urls: %s", json.dumps(list(q.keys()), indent=4, ensure_ascii=False))
+                q[
+                    f"http://{domain}/category/{category}/{path}/"
+                ] = True
+        # logger.info("possible urls: %s", json.dumps(list(q.keys()), indent=4, ensure_ascii=False))
         return list(q.keys())

     def get_first_memento(self, url):
         target = f"http://web.archive.org/web/timemap/link/{url}"
+        logger.info("requesting %s", url)
         mementos = requests.get(target)
-        if not mementos.text:
-            return None
-        for memento in mementos.text.split("\n"):
-            m = RE_FIRST.match(memento)
-            if m:
-                return settings.nameddict({
-                    'epoch': int(arrow.get(m.group('datetime'), "ddd, DD MMM YYYY HH:mm:ss ZZZ").to("utc").timestamp),
-                    'url': m.group('url')
-                })
+        if mementos.status_code == requests.codes.ok:
+            if not len(mementos.text):
+                logger.debug("empty memento response for %s", target)
+            for memento in mementos.text.split("\n"):
+                m = RE_FIRST.match(memento)
+                if m:
+                    r = settings.nameddict(
+                        {
+                            "epoch": int(
+                                arrow.get(
+                                    m.group("datetime"),
+                                    "ddd, DD MMM YYYY HH:mm:ss ZZZ",
+                                )
+                                .to("utc")
+                                .timestamp
+                            ),
+                            "url": m.group("url"),
+                        }
+                    )
+                    logger.info("found memento candidate: %s", r)
+                    return r
+                else:
+                    logger.debug(
+                        "no first memento found at: %s", target
+                    )
+        else:
+            logger.warning(
+                "request failed: %s, status: %s, txt: %s",
+                mementos,
+                mementos.status_code,
+                mementos.text,
+            )

     def run(self):
         l = self.possible_urls()
-        logging.info("running archive.org lookup for %s", self.path)
+        logger.info("running archive.org lookup for %s", self.path)
         for url in l:
             maybe = self.get_first_memento(url)
             if maybe:
                 if maybe.epoch < self.epoch:
                     self.epoch = maybe.epoch
                     self.oldest = maybe.url
+            sleep(.500)
         if not len(self.oldest):
             logger.error("no memento found for %s", self.path)
         else:
-            logger.info("\t\toldest found memento for %s: %s :: %s", self.path, str(arrow.get(self.epoch)), self.oldest)
+            logger.info(
+                "\t\toldest found memento for %s: %s :: %s",
+                self.path,
+                str(arrow.get(self.epoch)),
+                self.oldest,
+            )
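To make the parsing concrete: get_first_memento() downloads the TimeMap for a candidate URL and scans it line by line for the rel="first memento" entry, which RE_FIRST splits into URL and datetime. A toy run against a single made-up TimeMap line (link-format per RFC 7089, as archive.org serves it):

import re

import arrow

RE_FIRST = re.compile(
    r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$"
)

line = (
    '<http://web.archive.org/web/20130101000000/http://example.com/>; '
    'rel="first memento"; datetime="Tue, 01 Jan 2013 00:00:00 GMT",'
)

m = RE_FIRST.match(line)
if m:
    # same conversion the module does; note .timestamp is a property in
    # the arrow versions this code targets (newer arrow makes it a method)
    epoch = int(
        arrow.get(m.group("datetime"), "ddd, DD MMM YYYY HH:mm:ss ZZZ")
        .to("utc")
        .timestamp
    )
    print(epoch, m.group("url"))  # 1356998400 http://web.archive.org/web/...

run() then keeps the lowest epoch seen across all candidate URLs and sleeps half a second between TimeMap requests, which is the new rate limiting visible above.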