cleanups on wayback functionality

Peter Molnar 2019-08-14 11:28:01 +01:00
parent 0fcbfe0bd9
commit 9e0e58a4c6
4 changed files with 253 additions and 90 deletions

nasg.py

@@ -13,7 +13,8 @@ import re
 import asyncio
 import sqlite3
 import json
-#import base64
+# import base64
 from shutil import copy2 as cp
 from urllib.parse import urlparse
 from collections import namedtuple
@@ -38,6 +39,7 @@ from pandoc import PandocMD2HTML, PandocMD2TXT, PandocHTML2TXT
 from meta import Exif
 import settings
 import keys
+import wayback

 logger = logging.getLogger("NASG")
@@ -140,7 +142,7 @@ def maybe_copy(source, target):
 def extractdomain(url):
     url = urlparse(url)
-    return url.netloc
+    return url.hostname

 J2 = jinja2.Environment(
@@ -387,6 +389,7 @@ class Comment(MarkdownDoc):
         }
         return r

+
 class WebImage(object):
     def __init__(self, fpath, mdimg, parent):
         logger.debug("loading image: %s", fpath)
@@ -603,9 +606,7 @@ class WebImage(object):
             "Model": ["Model"],
             "FNumber": ["FNumber", "Aperture"],
             "ExposureTime": ["ExposureTime"],
-            "FocalLength": [
-                "FocalLength"
-            ],
+            "FocalLength": ["FocalLength"],
             "ISO": ["ISO"],
             "LensID": ["LensID", "LensSpec", "Lens"],
             "CreateDate": ["CreateDate", "DateTimeOriginal"],
@@ -681,14 +682,14 @@ class WebImage(object):
         self.size = size
         self.crop = crop

-    #@property
-    #def data(self):
-        #with open(self.fpath, "rb") as f:
-            #encoded = base64.b64encode(f.read())
-        #return "data:%s;base64,%s" % (
-            #self.parent.mime_type,
-            #encoded.decode("utf-8"),
-        #)
+    # @property
+    # def data(self):
+        # with open(self.fpath, "rb") as f:
+            # encoded = base64.b64encode(f.read())
+        # return "data:%s;base64,%s" % (
+            # self.parent.mime_type,
+            # encoded.decode("utf-8"),
+        # )

     @property
     def suffix(self):
@@ -806,6 +807,7 @@ class Singular(MarkdownDoc):
         self.dirpath = os.path.dirname(fpath)
         self.name = os.path.basename(self.dirpath)
         self.category = os.path.basename(os.path.dirname(self.dirpath))
+        self.pointers = []

     @cached_property
     def files(self):
@@ -848,7 +850,9 @@ class Singular(MarkdownDoc):
         the Singular object
         """
         images = {}
-        for match, alt, fname, title, css in RE_MDIMG.findall(self.content):
+        for match, alt, fname, title, css in RE_MDIMG.findall(
+            self.content
+        ):
             mdimg = MarkdownImage(match, alt, fname, title, css)
             imgpath = os.path.join(self.dirpath, fname)
             if imgpath in self.files:
@@ -1002,7 +1006,7 @@ class Singular(MarkdownDoc):
                 self.url,
                 url,
                 os.path.dirname(self.fpath),
-                self.dt.timestamp
+                self.dt.timestamp,
             )
             webmentions.append(w)
         if self.is_reply:
@@ -1010,7 +1014,7 @@ class Singular(MarkdownDoc):
                 self.url,
                 self.is_reply,
                 os.path.dirname(self.fpath),
-                self.dt.timestamp
+                self.dt.timestamp,
             )
             webmentions.append(w)
         return webmentions
@@ -1143,7 +1147,6 @@ class Singular(MarkdownDoc):
         if self.event:
             r.update({"subjectOf": self.event})
-
         for url in list(set(self.to_syndicate)):
             r["potentialAction"].append(
                 {
@@ -1199,7 +1202,9 @@ class Singular(MarkdownDoc):
     @property
     def corpus(self):
-        return "\n".join([self.title, self.name, self.summary, self.content])
+        return "\n".join(
+            [self.title, self.name, self.summary, self.content]
+        )

     async def copy_files(self):
         exclude = [
@@ -1231,12 +1236,35 @@ class Singular(MarkdownDoc):
             logger.info("copying '%s' to '%s'", f, t)
             cp(f, t)

     async def save_to_archiveorg(self):
         requests.get(f"http://web.archive.org/save/{self.url}")

+    async def get_from_archiveorg(self):
+        done = glob.glob(
+            os.path.join(self.dirpath, f"*archiveorg*.copy")
+        )
+        if done:
+            logger.debug(
+                "archive.org .copy exists for %s at %s",
+                self.name,
+                done[0],
+            )
+            return
+        logger.info("trying to get archive.org .copy for %s", self.name)
+        if len(self.category):
+            wb = wayback.FindWaybackURL(
+                self.name, self.category, self.pointers
+            )
+            wb.run()
+            if len(wb.oldest):
+                archiveurl = url2slug(wb.oldest)
+                t = os.path.join(self.dirpath, f"{archiveurl}.copy")
+                writepath(t, wb.oldest)
+
     async def render(self):
+        if settings.args.get("memento"):
+            await self.get_from_archiveorg()
         if self.exists:
             return True
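
The new get_from_archiveorg is, in effect, a cached lookup: any existing *archiveorg*.copy file in the post directory short-circuits the archive.org query on later builds, since the marker's file name is the slugified memento URL and the file body is the URL itself. A minimal sketch of the same marker-file pattern, with a hypothetical post_dir and lookup() callable standing in for the Singular internals:

    import glob
    import os

    def cached_wayback_lookup(post_dir, name, lookup):
        """Resolve the oldest archive.org memento for a post at most once."""
        # an existing marker means a previous build already resolved this post
        done = glob.glob(os.path.join(post_dir, "*archiveorg*.copy"))
        if done:
            with open(done[0]) as f:
                return f.read().strip()
        oldest = lookup(name)  # e.g. FindWaybackURL(...).run(), then .oldest
        if oldest:
            # the marker doubles as storage: the file body is the memento URL
            with open(os.path.join(post_dir, "web-archive-org.copy"), "wt") as f:
                f.write(oldest)
        return oldest

Because the marker stores the URL, a rebuild never has to query archive.org twice for the same post.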
@@ -1247,7 +1275,7 @@ class Singular(MarkdownDoc):
             "site": settings.site,
             "menu": settings.menu,
             "meta": settings.meta,
-            "fnames": settings.filenames
+            "fnames": settings.filenames,
         }
         writepath(
             self.renderfile, J2.get_template(self.template).render(v)
@@ -1260,8 +1288,7 @@ class Singular(MarkdownDoc):
                 "content": self.txt_content,
             }
             writepath(
-                self.txtfile,
-                J2.get_template(self.txttemplate).render(g),
+                self.txtfile, J2.get_template(self.txttemplate).render(g)
             )
             del g
@@ -1300,12 +1327,7 @@ class Home(Singular):
         return arrow.get(ts)

     async def render_gopher(self):
-        lines = [
-            "%s's gopherhole"
-            % (settings.site.name),
-            "",
-            "",
-        ]
+        lines = ["%s's gopherhole" % (settings.site.name), "", ""]

         for category, post in self.posts:
             line = "1%s\t/%s/%s\t%s\t70" % (
@@ -1585,7 +1607,7 @@ class Category(dict):
     def renderdir(self):
         b = settings.paths.build
         if len(self.name):
-            b = os.path.join(b,settings.paths.category, self.name)
+            b = os.path.join(b, settings.paths.category, self.name)
         return b

     @property
@@ -1623,11 +1645,19 @@ class Category(dict):
         @property
         def mtime(self):
-            return max(list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination])
+            return max(
+                list(sorted(self.parent.keys(), reverse=True))[
+                    0 : settings.pagination
+                ]
+            )

         @property
         def renderfile(self):
-            return os.path.join(self.parent.renderdir, settings.paths.feed, settings.filenames.json)
+            return os.path.join(
+                self.parent.renderdir,
+                settings.paths.feed,
+                settings.filenames.json,
+            )

         @property
         def exists(self):
@@ -1641,10 +1671,14 @@ class Category(dict):
         async def render(self):
             if self.exists:
-                logger.debug("category %s is up to date", self.parent.name)
+                logger.debug(
+                    "category %s is up to date", self.parent.name
+                )
                 return

-            logger.info("rendering JSON feed for category %s", self.parent.name)
+            logger.info(
+                "rendering JSON feed for category %s", self.parent.name
+            )

             js = {
                 "version": "https://jsonfeed.org/version/1",
@@ -1659,7 +1693,9 @@ class Category(dict):
                 "items": [],
             }

-            for key in list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination]:
+            for key in list(sorted(self.parent.keys(), reverse=True))[
+                0 : settings.pagination
+            ]:
                 post = self.parent[key]
                 pjs = {
                     "id": post.url,
@@ -1676,12 +1712,15 @@ class Category(dict):
                         "attachment": {
                             "url": post.photo.href,
                             "mime_type": post.photo.mime_type,
-                            "size_in_bytes": f"{post.photo.mime_size}"
+                            "size_in_bytes": f"{post.photo.mime_size}",
                         }
                     }
                 )
                 js["items"].append(pjs)

-            writepath(self.renderfile,json.dumps(js, indent=4, ensure_ascii=False))
+            writepath(
+                self.renderfile,
+                json.dumps(js, indent=4, ensure_ascii=False),
+            )

     class XMLFeed(object):
         def __init__(self, parent, feedformat="rss"):
@@ -1690,7 +1729,11 @@ class Category(dict):
         @property
         def mtime(self):
-            return max(list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination])
+            return max(
+                list(sorted(self.parent.keys(), reverse=True))[
+                    0 : settings.pagination
+                ]
+            )

         @property
         def renderfile(self):
@@ -1700,7 +1743,9 @@ class Category(dict):
                 fname = settings.filenames.atom
             else:
                 fname = "index.xml"
-            return os.path.join(self.parent.renderdir, settings.paths.feed, fname)
+            return os.path.join(
+                self.parent.renderdir, settings.paths.feed, fname
+            )

         @property
         def exists(self):
@@ -1714,10 +1759,16 @@ class Category(dict):
         async def render(self):
             if self.exists:
-                logger.debug("category %s is up to date", self.parent.name)
+                logger.debug(
+                    "category %s is up to date", self.parent.name
+                )
                 return

-            logger.info("rendering %s feed for category %s", self.feedformat, self.parent.name)
+            logger.info(
+                "rendering %s feed for category %s",
+                self.feedformat,
+                self.parent.name,
+            )

             fg = FeedGenerator()
             fg.id(self.parent.feedurl)
@@ -1732,12 +1783,14 @@ class Category(dict):
                 }
             )
             if self.feedformat == "rss":
-                fg.link(href=self.feedurl)
+                fg.link(href=self.parent.feedurl)
             elif self.feedformat == "atom":
-                fg.link(href=self.feedurl, rel="self")
+                fg.link(href=self.parent.feedurl, rel="self")
                 fg.link(href=settings.meta.get("hub"), rel="hub")

-            for key in list(sorted(self.parent.keys(), reverse=True))[0:settings.pagination]:
+            for key in list(sorted(self.parent.keys(), reverse=True))[
+                0 : settings.pagination
+            ]:
                 post = self.parent[key]
                 fe = fg.add_entry()
@@ -1787,7 +1840,6 @@ class Category(dict):
             writepath(self.renderfile, fg.atom_str(pretty=True))

-
     class Year(object):
         def __init__(self, parent, year):
             self.parent = parent
@@ -1814,9 +1866,15 @@ class Category(dict):
         @property
         def renderfile(self):
             if self.year == self.parent.newest_year:
-                return os.path.join(self.parent.renderdir, settings.filenames.html)
+                return os.path.join(
+                    self.parent.renderdir, settings.filenames.html
+                )
             else:
-                return os.path.join(self.parent.renderdir, self.year, settings.filenames.html)
+                return os.path.join(
+                    self.parent.renderdir,
+                    self.year,
+                    settings.filenames.html,
+                )

         @property
         def baseurl(self):
@@ -1854,19 +1912,25 @@ class Category(dict):
                     "title": self.parent.title,
                     "paginated": True,
                     "years": self.parent.years,
-                    "year": self.year
+                    "year": self.year,
                 },
-                "posts": self.posttmplvars
+                "posts": self.posttmplvars,
             }

         async def render(self):
             if self.exists:
-                logger.debug("category %s is up to date", self.parent.name)
+                logger.debug(
+                    "category %s is up to date", self.parent.name
+                )
                 return

-            logger.info("rendering year %s for category %s", self.year, self.parent.name)
+            logger.info(
+                "rendering year %s for category %s",
+                self.year,
+                self.parent.name,
+            )
             r = J2.get_template(self.template).render(self.tmplvars)
             writepath(self.renderfile, r)
-            del(r)
+            del r

     class Flat(object):
         def __init__(self, parent):
@@ -1876,7 +1940,9 @@ class Category(dict):
         def posttmplvars(self):
             return [
                 self.parent[key].jsonld
-                for key in list(sorted(self.parent.keys(), reverse=True))
+                for key in list(
+                    sorted(self.parent.keys(), reverse=True)
+                )
             ]

         @property
@@ -1885,7 +1951,9 @@ class Category(dict):
         @property
         def renderfile(self):
-            return os.path.join(self.parent.renderdir, settings.filenames.html)
+            return os.path.join(
+                self.parent.renderdir, settings.filenames.html
+            )

         @property
         def template(self):
@@ -1915,17 +1983,19 @@ class Category(dict):
                     "feed": self.parent.feedurl,
                     "title": self.parent.title,
                 },
-                "posts": self.posttmplvars
+                "posts": self.posttmplvars,
             }

         async def render(self):
             if self.exists:
-                logger.debug("category %s is up to date", self.parent.name)
+                logger.debug(
+                    "category %s is up to date", self.parent.name
+                )
                 return

             logger.info("rendering category %s", self.parent.name)
             r = J2.get_template(self.template).render(self.tmplvars)
             writepath(self.renderfile, r)
-            del(r)
+            del r

     class Gopher(object):
         def __init__(self, parent):
@@ -1947,17 +2017,27 @@ class Category(dict):
         @property
         def renderfile(self):
-            return os.path.join(self.parent.renderdir, settings.filenames.gopher)
+            return os.path.join(
+                self.parent.renderdir, settings.filenames.gopher
+            )

         async def render(self):
             if self.exists:
-                logger.debug("category %s is up to date", self.parent.name)
+                logger.debug(
+                    "category %s is up to date", self.parent.name
+                )
                 return
-            lines = ["%s - %s" % (self.parent.name, settings.site.name), "", ""]
+            lines = [
+                "%s - %s" % (self.parent.name, settings.site.name),
+                "",
+                "",
+            ]
             for post in [
                 self.parent[key]
-                for key in list(sorted(self.parent.keys(), reverse=True))
+                for key in list(
+                    sorted(self.parent.keys(), reverse=True)
+                )
             ]:
                 line = "0%s\t/%s/%s\t%s\t70" % (
                     post.title,
@@ -1979,6 +2059,7 @@ class Category(dict):
                 lines.append("")
             writepath(self.renderfile, "\r\n".join(lines))

+
 class Sitemap(dict):
     @property
     def mtime(self):
@@ -2058,7 +2139,6 @@ class Webmention(object):
         else:
             self.save(r.text)

-
     def backfill_syndication(self):
         """ this is very specific to webmention.io and brid.gy publish """
@@ -2105,9 +2185,7 @@ class Webmention(object):
                 if "url" in maybe["http_body"]:
                     data = json.loads(maybe["http_body"])
                     url = data["url"]
-                    sp = os.path.join(
-                        self.dpath, "%s.copy" % url2slug(url)
-                    )
+                    sp = os.path.join(self.dpath, "%s.copy" % url2slug(url))
                     if os.path.exists(sp):
                         return
                     with open(sp, "wt") as f:
@@ -2123,6 +2201,7 @@ class Webmention(object):
             )
             pass

+
 class WebmentionIO(object):
     def __init__(self):
         self.params = {
@@ -2258,12 +2337,22 @@ def make():
     frontposts = Category()
     home = Home(settings.paths.get("home"))

+    reverse_redirects = {}
+    for e in glob.glob(os.path.join(content, "*", "*.url")):
+        post = Redirect(e)
+        rules.add_redirect(post.source, post.target)
+        if post.target not in reverse_redirects:
+            reverse_redirects[post.target] = []
+        reverse_redirects[post.target].append(post.source)
+
     for e in sorted(
         glob.glob(
            os.path.join(content, "*", "*", settings.filenames.md)
         )
     ):
         post = Singular(e)
+        if post.url in reverse_redirects:
+            post.pointers = reverse_redirects[post.target]
         # deal with images, if needed
         for i in post.images.values():
             queue.put(i.downsize())
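
The reverse_redirects map above inverts the *.url redirect files, each of which maps a former slug (source) to a live URL (target), so a post can discover every name that used to point at it; those end up in post.pointers and widen the wayback lookup. A small illustration with made-up slugs and domain:

    # each *.url file yields a (source, target) pair via Redirect
    redirects = [
        ("old-slug", "https://example.net/note/new-slug/"),
        ("older-slug", "https://example.net/note/new-slug/"),
    ]

    reverse_redirects = {}
    for source, target in redirects:
        reverse_redirects.setdefault(target, []).append(source)

    # a post whose URL is a redirect target now knows its former names:
    # reverse_redirects["https://example.net/note/new-slug/"]
    # == ["old-slug", "older-slug"]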
@@ -2279,11 +2368,11 @@ def make():
         if post.is_future:
             logger.info("%s is for the future", post.name)
             continue
-        elif not os.path.exists(post.renderfile):
-            logger.debug(
-                "%s seems to be fist time published", post.name
-            )
-            firsttimepublished.append(post)
+        # elif not os.path.exists(post.renderfile):
+        # logger.debug(
+        # "%s seems to be fist time published", post.name
+        # )
+        # firsttimepublished.append(post)

         # add post to search database
         search.append(post)
@@ -2330,7 +2419,7 @@ def make():
         home.add(category, category.get(category.sortedkeys[0]))
         queue.put(category.render())

-    #queue.put(frontposts.render_feeds())
+    # queue.put(frontposts.render_feeds())
     queue.put(home.render())
     # actually run all the render & copy tasks
     queue.run()
@@ -2377,9 +2466,8 @@ def make():
     queue.run()
     logger.info("sending webmentions finished")

-    for post in firsttimepublished:
-        queue.put(post.save_memento())
-        queue.put(post.save_to_archiveorg())
+    # for post in firsttimepublished:
+    # queue.put(post.save_to_archiveorg())
     queue.run()

settings.py

@@ -205,6 +205,36 @@ gones = [
     "^/broadcast/wp-ffpc\.message$",
 ]

+formerdomains = [
+    "cadeyrn.webporfolio.hu",
+    "blog.petermolnar.eu",
+    "petermolnar.eu",
+]
+
+formercategories = {
+    "article": [
+        "linux-tech-coding",
+        "diy-do-it-yourself",
+        "sysadmin-blog",
+        "sysadmin",
+        "szubjektiv-technika",
+        "wordpress"
+    ],
+    "note": [
+        "blips",
+        "blog",
+        "r"
+    ],
+    "journal": [
+        "blog",
+    ],
+    "photo": [
+        "photoblog",
+        "fotography",
+    ]
+}
+
 if os.path.isdir("/dev/shm") and os.access("/dev/shm", os.W_OK):
     tmpdir = "/dev/shm/nasg"
 else:
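
These two settings drive FindWaybackURL.possible_urls in wayback.py (below): every former domain is permuted with the post path, the current category, and every former category, in both /{category}/ and /category/{category}/ shapes. A sketch of the candidates for a hypothetical post example-post in the note category, for one domain:

    domain = "blog.petermolnar.eu"  # one entry of formerdomains
    categories = ["blips", "blog", "r", "note"]  # formercategories["note"] plus current

    candidates = [f"http://{domain}/example-post/"]
    for category in categories:
        candidates.append(f"http://{domain}/{category}/example-post/")
        candidates.append(f"http://{domain}/category/{category}/example-post/")
    # nine candidates for this one domain; the real code repeats the loop per
    # domain and de-duplicates by collecting the URLs as dict keys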

(SVG icon sprite)

@@ -222,4 +222,9 @@
     <symbol id="icon-www.flickr.com" viewBox="0 0 16 16">
         <path fill="#0063dc" d="M0 8c0 2.049 1.663 3.709 3.71 3.709 2.050 0 3.713-1.66 3.713-3.709s-1.662-3.709-3.713-3.709c-2.047 0-3.71 1.66-3.71 3.709zM8.577 8c0 2.049 1.662 3.709 3.711 3.709 2.042 0 3.711-1.66 3.711-3.709s-1.661-3.709-3.709-3.709c-2.050 0-3.713 1.66-3.713 3.709z"></path>
     </symbol>
+    <symbol id="icon-web.archive.org" viewBox="0 0 17 16">
+        <path d="M16 15v-1h-1v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-3v-6h1v-1h-3v1h1v6h-1v1h-1v1h17v-1h-1z"></path>
+        <path d="M8 0h1l8 5v1h-17v-1l8-5z"></path>
+    </symbol>
 </svg>


wayback.py

@@ -13,21 +13,26 @@ from collections import deque
 from urllib.parse import urlparse
 import settings
 import arrow
+from time import sleep

 logger = logging.getLogger("wayback")
 logger.setLevel(10)

 console_handler = logging.StreamHandler()
-formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+formatter = logging.Formatter(
+    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
 console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)

 from pprint import pprint

-RE_FIRST = re.compile(r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$")
+RE_FIRST = re.compile(
+    r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$"
+)

 class FindWaybackURL(object):
     def __init__(self, path, category="", redirects=[]):
         self.path = path
         self.category = category
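
RE_FIRST plucks the "first memento" entry out of an archive.org TimeMap, which is served one link value per line in RFC 7089 link format. A quick self-check against a representative line (the snapshot URL and date are illustrative):

    import re

    RE_FIRST = re.compile(
        r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$"
    )

    line = (
        '<http://web.archive.org/web/20130810034747/'
        'http://petermolnar.eu/example-post/>; '
        'rel="first memento"; datetime="Sat, 10 Aug 2013 03:47:47 GMT",'
    )
    m = RE_FIRST.match(line)
    assert m.group("url").endswith("http://petermolnar.eu/example-post/")
    assert m.group("datetime") == "Sat, 10 Aug 2013 03:47:47 GMT"

The captured datetime is exactly what get_first_memento feeds to arrow with the "ddd, DD MMM YYYY HH:mm:ss ZZZ" format string.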
@@ -49,39 +54,74 @@ class FindWaybackURL(object):
         for domain in domains:
             q[f"http://{domain}/{path}/"] = True

             if self.category in settings.formercategories:
-                categories = settings.formercategories[self.category]
+                categories = settings.formercategories[
+                    self.category
+                ]
             else:
                 categories = []
             categories.append(self.category)
             for category in categories:
                 q[f"http://{domain}/{category}/{path}/"] = True
-                q[f"http://{domain}/category/{category}/{path}/"] = True
-        #logger.info("possible urls: %s", json.dumps(list(q.keys()), indent=4, ensure_ascii=False))
+                q[
+                    f"http://{domain}/category/{category}/{path}/"
+                ] = True
+        # logger.info("possible urls: %s", json.dumps(list(q.keys()), indent=4, ensure_ascii=False))
         return list(q.keys())

     def get_first_memento(self, url):
         target = f"http://web.archive.org/web/timemap/link/{url}"
+        logger.info("requesting %s", url)
         mementos = requests.get(target)
-        if not mementos.text:
-            return None
-        for memento in mementos.text.split("\n"):
-            m = RE_FIRST.match(memento)
-            if m:
-                return settings.nameddict({
-                    'epoch': int(arrow.get(m.group('datetime'), "ddd, DD MMM YYYY HH:mm:ss ZZZ").to("utc").timestamp),
-                    'url': m.group('url')
-                })
+        if mementos.status_code == requests.codes.ok:
+            if not len(mementos.text):
+                logger.debug("empty memento response for %s", target)
+            for memento in mementos.text.split("\n"):
+                m = RE_FIRST.match(memento)
+                if m:
+                    r = settings.nameddict(
+                        {
+                            "epoch": int(
+                                arrow.get(
+                                    m.group("datetime"),
+                                    "ddd, DD MMM YYYY HH:mm:ss ZZZ",
+                                )
+                                .to("utc")
+                                .timestamp
+                            ),
+                            "url": m.group("url"),
+                        }
+                    )
+                    logger.info("found memento candidate: %s", r)
+                    return r
+                else:
+                    logger.debug(
+                        "no first memento found at: %s", target
+                    )
+        else:
+            logger.warning(
+                "request failed: %s, status: %s, txt: %s",
+                mementos,
+                mementos.status_code,
+                mementos.text,
+            )

     def run(self):
         l = self.possible_urls()
-        logging.info("running archive.org lookup for %s", self.path)
+        logger.info("running archive.org lookup for %s", self.path)
         for url in l:
             maybe = self.get_first_memento(url)
             if maybe:
                 if maybe.epoch < self.epoch:
                     self.epoch = maybe.epoch
                     self.oldest = maybe.url
+            sleep(.500)
         if not len(self.oldest):
             logger.error("no memento found for %s", self.path)
         else:
-            logger.info("\t\toldest found memento for %s: %s :: %s", self.path, str(arrow.get(self.epoch)), self.oldest)
+            logger.info(
+                "\t\toldest found memento for %s: %s :: %s",
+                self.path,
+                str(arrow.get(self.epoch)),
+                self.oldest,
+            )
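
End to end, a minimal driver for the class as it stands after this commit, assuming the module is importable as wayback the way nasg.py does it; the slug, category, and redirect list are illustrative. Note that run() sleeps half a second between TimeMap requests, so a post with many candidate URLs is slow by design:

    from wayback import FindWaybackURL

    wb = FindWaybackURL("example-post", "note", ["old-slug"])
    wb.run()
    if len(wb.oldest):
        print("oldest memento:", wb.oldest)
    else:
        print("nothing archived under any former URL")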