nasg/wayback.py

__author__ = "Peter Molnar"
__copyright__ = "Copyright 2017-2019, Peter Molnar"
__license__ = "apache-2.0"
__maintainer__ = "Peter Molnar"
__email__ = "mail@petermolnar.net"

import re
import json
import os
import logging
import requests
from collections import deque
from urllib.parse import urlparse
import settings
import arrow
from time import sleep

# Module-wide logger for the wayback lookups: it accepts every record from
# DEBUG upwards and hands them to a StreamHandler (stderr unless told
# otherwise) using a timestamped "time - name - level - message" layout.
logger = logging.getLogger("wayback")
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

from pprint import pprint

# Matches one line of a timemap "link" response that describes the first
# (oldest) memento, i.e. a line shaped like:
#   <SNAPSHOT_URL>; rel="first memento"; datetime="SOME DATE"...
# Named groups:
#   url      - the text between the angle brackets
#   datetime - the text inside the datetime="..." attribute
RE_FIRST = re.compile(
    r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$"
)


class FindWaybackURL(object):
    """Find the oldest archive.org snapshot of a page of this site.

    The page is identified by its ``path`` (and optionally its
    ``category``). Every URL the page may have lived under is built from
    ``settings`` and looked up in the web.archive.org timemap; the oldest
    "first memento" wins.

    Attributes:
        path: path of the page, used as the URL path component.
        category: current category of the page; used to look up former
            category names in ``settings.formercategories``.
        epoch: UNIX timestamp of the oldest snapshot found so far;
            starts out as "now".
        oldest: URL of the oldest snapshot found so far; empty string
            when nothing was found (yet).
    """

    # Seconds to wait on archive.org before giving up on a single request;
    # without a timeout `requests` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, path, category=""):
        """Store the page identity and reset the search state.

        Args:
            path: path of the page to look up.
            category: category of the page, defaults to an empty string.
        """
        self.path = path
        self.category = category
        self.epoch = self._to_epoch(arrow.utcnow())
        self.oldest = ""

    @staticmethod
    def _to_epoch(moment):
        """Return the whole-second UNIX timestamp of an arrow object."""
        stamp = moment.timestamp
        # arrow < 1.0 exposes `timestamp` as a property, arrow >= 1.0 as a
        # method; support both so an arrow upgrade does not break lookups.
        if callable(stamp):
            stamp = stamp()
        return int(stamp)

    def save_to_archiveorg(self):
        """Ask archive.org to take a snapshot of the current page URLs.

        Requests the "save" endpoint for both the directory style URL and
        its ``index.html``. Failures are logged and otherwise ignored:
        saving is a best-effort operation.
        """
        targets = (
            f"{settings.site.url}/{self.path}/",
            f"{settings.site.url}/{self.path}/index.html",
        )
        for url in targets:
            logger.info("saving %s to archive.org ", url)
            try:
                requests.get(
                    f"https://web.archive.org/save/{url}",
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException as err:
                logger.warning(
                    "saving %s to archive.org failed: %s", url, err
                )

    def possible_urls(self):
        """Return every URL the page may have been archived under.

        Combines the current site name and ``settings.formerdomains`` with
        the former names of the page's category (if any are listed in
        ``settings.formercategories``).

        Returns:
            list of unique URL strings, in the order they were generated.
        """
        # a dict is used as an insertion-ordered set to drop duplicates
        q = {}
        q[f"http://{settings.site.name}/{self.path}/"] = True
        q[f"http://{settings.site.name}/{self.path}/index.html"] = True

        # the former categories do not depend on the domain: resolve once
        categories = []
        if self.category in settings.formercategories:
            categories = list(settings.formercategories[self.category])

        domains = settings.formerdomains + [settings.site.name]
        for domain in domains:
            q[f"http://{domain}/{self.path}/"] = True
            for category in categories:
                q[f"http://{domain}/{category}/{self.path}/"] = True
                q[f"http://{domain}/category/{category}/{self.path}/"] = True
        return list(q)

    def get_first_memento(self, url):
        """Look up the first (oldest) memento archive.org has for ``url``.

        Args:
            url: the original URL to look up in the timemap.

        Returns:
            A ``settings.nameddict`` with the keys ``epoch`` (UNIX
            timestamp of the snapshot, in UTC) and ``url`` (snapshot URL),
            or ``None`` when the request failed or the response contained
            no "first memento" line.
        """
        target = f"http://web.archive.org/web/timemap/link/{url}"
        logger.info("requesting %s", url)
        try:
            mementos = requests.get(target, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # a network hiccup on one candidate must not abort the whole run
            logger.warning("request failed: %s, error: %s", target, err)
            return None

        if mementos.status_code != requests.codes.ok:
            logger.warning(
                "request failed: %s, status: %s, txt: %s",
                mementos,
                mementos.status_code,
                mementos.text,
            )
            return None

        if not mementos.text:
            logger.debug("empty memento response for %s", target)

        for line in mementos.text.split("\n"):
            m = RE_FIRST.match(line)
            if not m:
                continue
            first = arrow.get(
                m.group("datetime"), "ddd, DD MMM YYYY HH:mm:ss ZZZ"
            ).to("utc")
            r = settings.nameddict(
                {"epoch": self._to_epoch(first), "url": m.group("url")}
            )
            logger.info("found memento candidate: %s", r)
            return r

        # reported once per response, not once per non-matching line
        logger.debug("no first memento found at: %s", target)
        return None

    def run(self):
        """Check every possible URL and remember the oldest snapshot.

        Updates ``self.epoch`` and ``self.oldest`` whenever an older
        memento is found. When nothing is found at all, the current URLs
        are submitted to archive.org via ``save_to_archiveorg``.
        """
        logger.info("running archive.org lookup for %s", self.path)
        for url in self.possible_urls():
            maybe = self.get_first_memento(url)
            if maybe and maybe.epoch < self.epoch:
                self.epoch = maybe.epoch
                self.oldest = maybe.url
            # pause between requests to go easy on archive.org
            sleep(0.5)

        if not self.oldest:
            logger.error("no memento found for %s", self.path)
            self.save_to_archiveorg()
        else:
            logger.info(
                "\t\toldest found memento for %s: %s :: %s",
                self.path,
                str(arrow.get(self.epoch)),
                self.oldest,
            )
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00			`__author__ = "Peter Molnar"`
			`__copyright__ = "Copyright 2017-2019, Peter Molnar"`
			`__license__ = "apache-2.0"`
			`__maintainer__ = "Peter Molnar"`
			`__email__ = "mail@petermolnar.net"`

			`import re`
			`import json`
			`import os`
			`import logging`
			`import requests`
			`from collections import deque`
			`from urllib.parse import urlparse`
			`import settings`
			`import arrow`
cleanups on wayback functionality 2019-08-14 11:28:01 +01:00			`from time import sleep`
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00
			`logger = logging.getLogger("wayback")`
			`logger.setLevel(10)`

			`console_handler = logging.StreamHandler()`
cleanups on wayback functionality 2019-08-14 11:28:01 +01:00			`formatter = logging.Formatter(`
			`"%(asctime)s - %(name)s - %(levelname)s - %(message)s"`
			`)`
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00			`console_handler.setFormatter(formatter)`
			`logger.addHandler(console_handler)`

			`from pprint import pprint`

cleanups on wayback functionality 2019-08-14 11:28:01 +01:00			`RE_FIRST = re.compile(`
			`r"^\<(?P<url>[^>]+)\>; rel=\"first memento\"; datetime=\"(?P<datetime>[^\"]+).*$"`
			`)`
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00

cleanups on wayback functionality 2019-08-14 11:28:01 +01:00			`class FindWaybackURL(object):`
- added photo layout with CSS flexbox - removed webhook - the functionality now lives in zapier - replaced the flickr logo - arrow is temporarily locked to 0.14.2 because of the ultra annoying warning message for upcoming 0.15 2019-08-23 09:06:26 +01:00			`def __init__(self, path, category=""):`
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00			`self.path = path`
			`self.category = category`
			`self.epoch = int(arrow.utcnow().timestamp)`
			`self.oldest = ""`

save to archive.org if memento lookup fails 2019-08-28 08:52:11 +01:00			`def save_to_archiveorg(self):`
			`urls = [`
			`f"{settings.site.url}/{self.path}/",`
			`f"{settings.site.url}/{self.path}/index.html"`
			`]`
			`for url in urls:`
			`logger.info("saving %s to archive.org ", url)`
			`r = requests.get(f"https://web.archive.org/save/{url}")`

this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00			`def possible_urls(self):`
			`q = {}`
- added photo layout with CSS flexbox - removed webhook - the functionality now lives in zapier - replaced the flickr logo - arrow is temporarily locked to 0.14.2 because of the ultra annoying warning message for upcoming 0.15 2019-08-23 09:06:26 +01:00			`q[f"http://{settings.site.name}/{self.path}/"] = True`
			`q[f"http://{settings.site.name}/{self.path}/index.html"] = True`

			`domains = settings.formerdomains + [settings.site.name]`
			`for domain in domains:`
			`q[f"http://{domain}/{self.path}/"] = True`
save to archive.org if memento lookup fails 2019-08-28 08:52:11 +01:00			`categories = []`
- added photo layout with CSS flexbox - removed webhook - the functionality now lives in zapier - replaced the flickr logo - arrow is temporarily locked to 0.14.2 because of the ultra annoying warning message for upcoming 0.15 2019-08-23 09:06:26 +01:00			`if self.category in settings.formercategories:`
			`categories = categories + settings.formercategories[self.category]`
			`for category in categories:`
			`q[f"http://{domain}/{category}/{self.path}/"] = True`
			`q[`
			`f"http://{domain}/category/{category}/{self.path}/"`
			`] = True`
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00			`return list(q.keys())`

			`def get_first_memento(self, url):`
			`target = f"http://web.archive.org/web/timemap/link/{url}"`
cleanups on wayback functionality 2019-08-14 11:28:01 +01:00			`logger.info("requesting %s", url)`
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00			`mementos = requests.get(target)`
cleanups on wayback functionality 2019-08-14 11:28:01 +01:00			`if mementos.status_code == requests.codes.ok:`
			`if not len(mementos.text):`
			`logger.debug("empty memento response for %s", target)`
			`for memento in mementos.text.split("\n"):`
			`m = RE_FIRST.match(memento)`
			`if m:`

			`r = settings.nameddict(`
			`{`
			`"epoch": int(`
			`arrow.get(`
			`m.group("datetime"),`
			`"ddd, DD MMM YYYY HH:mm:ss ZZZ",`
			`)`
			`.to("utc")`
			`.timestamp`
			`),`
			`"url": m.group("url"),`
			`}`
			`)`
			`logger.info("found memento candidate: %s", r)`
			`return r`
			`else:`
			`logger.debug(`
			`"no first memento found at: %s", target`
			`)`
			`else:`
			`logger.warning(`
			`"request failed: %s, status: %s, txt: %s",`
			`mementos,`
			`mementos.status_code,`
			`mementos.text,`
			`)`
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00
			`def run(self):`
			`l = self.possible_urls()`
cleanups on wayback functionality 2019-08-14 11:28:01 +01:00			`logger.info("running archive.org lookup for %s", self.path)`
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00			`for url in l:`
			`maybe = self.get_first_memento(url)`
			`if maybe:`
			`if maybe.epoch < self.epoch:`
			`self.epoch = maybe.epoch`
			`self.oldest = maybe.url`
cleanups on wayback functionality 2019-08-14 11:28:01 +01:00			`sleep(.500)`
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00			`if not len(self.oldest):`
			`logger.error("no memento found for %s", self.path)`
save to archive.org if memento lookup fails 2019-08-28 08:52:11 +01:00			`self.save_to_archiveorg()`
this is a replacement for the removed memento functionality; a tiny wayback machine crawler that tried to look up the earliest saved version of a page 2019-08-14 10:17:20 +01:00			`else:`
cleanups on wayback functionality 2019-08-14 11:28:01 +01:00			`logger.info(`
			`"\t\toldest found memento for %s: %s :: %s",`
			`self.path,`
			`str(arrow.get(self.epoch)),`
			`self.oldest,`
			`)`