# petermolnar's repositories — nasg (a990dc27ae07970af67a5bcc3fe09a5dcc6646a7): offlinecopies.py

# offlinecopies.py (view raw)
import glob
import json
import logging
import os
import subprocess
from urllib.parse import urlparse, urlunparse

import frontmatter
import requests

import shared


# Start from a clean slate: detach every handler already attached to the
# root logger, otherwise basicConfig() below would be a no-op.
for stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(stale_handler)

# Root logger at DEBUG (numeric level 10) with timestamped one-line records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)


def find_realurl(url):
    """Follow redirects for *url* and report where it finally lands.

    The request is sent with a desktop Firefox User-Agent and a 60 second
    timeout. Returns a ``(final_url, status_code)`` tuple in which every
    query parameter starting with ``utm_`` has been removed from
    ``final_url``. If the request raises, the error is logged and
    ``(None, 400)`` is returned instead.
    """
    request_headers = requests.utils.default_headers()
    request_headers['User-Agent'] = (
        'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    )

    try:
        response = requests.get(
            url,
            headers=request_headers,
            timeout=60,
            allow_redirects=True,
        )
    except Exception as err:
        logging.error('getting real url failed: %s', err)
        return (None, 400)

    # Rebuild the final URL without the utm_* tracking parameters.
    parts = urlparse(response.url)
    kept_params = []
    for param in parts.query.split('&'):
        if not param.startswith('utm_'):
            kept_params.append(param)
    cleaned = urlunparse(parts._replace(query='&'.join(kept_params)))

    return (cleaned, response.status_code)

def find_archiveorgurl(url, timeout=60):
    """Return an address *url* can be fetched from, preferring the live site.

    The address is first resolved with find_realurl(). If that answers with
    HTTP 200 the resolved URL is returned as is. Otherwise the archive.org
    Wayback availability endpoint is queried and the URL of the closest
    archived snapshot is returned.

    :param url: the address to look up
    :param timeout: seconds to wait for the archive.org request
    :return: a URL string, or None when the archive.org request fails,
        answers with an error status, cannot be parsed, or lists no snapshot
    """
    realurl, status = find_realurl(url)
    if status == requests.codes.ok:
        return realurl

    # find_realurl() hands back None when the request itself failed; in that
    # case keep asking the archive about the address we were given instead of
    # the literal string "None".
    lookup = realurl or url

    try:
        # `params` lets requests percent-encode the target, so a `?` or `&`
        # inside it cannot corrupt the availability query.
        a = requests.get(
            "http://archive.org/wayback/available",
            params={'url': lookup},
            timeout=timeout,
        )
    except Exception:
        logging.error(
            'Failed to fetch archive.org availability for %s', lookup)
        return None

    # A requests Response is falsy when its status is 4xx/5xx.
    if not a:
        logging.error('empty archive.org availability for %s', lookup)
        return None

    try:
        a = json.loads(a.text)
        aurl = a.get(
            'archived_snapshots', {}
        ).get(
            'closest', {}
        ).get(
            'url', None
        )
        if aurl:
            logging.debug("found %s in archive.org for %s", aurl, lookup)
            return aurl
    except Exception as e:
        # best effort: an unexpected payload is logged, not raised
        logging.error("archive.org parsing failed: %s", e)

    return None


class wget(shared.CMDLine):
    """Save an offline copy of a single URL by running the wget binary.

    The copy is written to ``<offlinecopiesdir>/<slug>``, where
    ``offlinecopiesdir`` comes from the ``source`` section of
    ``shared.config`` and ``slug`` is either the given *dirname* or a name
    derived from the URL.
    """

    def __init__(self, url, dirname=None):
        """
        :param url: the address to download
        :param dirname: directory name to store the copy under; when empty,
            a slug derived from *url* is used
        """
        super().__init__('wget')
        self.url = url
        # NOTE(review): the original used the bare names `slugfname` and
        # `config`, neither of which is defined in this file. `shared.config`
        # is what the rest of the file uses; `shared.slugfname` is assumed to
        # live next to it - confirm.
        self.slug = dirname or shared.slugfname(self.url)
        self.saveto = os.path.join(
            shared.config.get('source', 'offlinecopiesdir'),
            self.slug
        )

    def archive(self):
        """Download ``self.url`` plus its page requisites into ``self.saveto``.

        A non-zero wget exit status is logged as an error together with the
        command and its stderr output; nothing is raised.

        :return: whatever wget printed on stdout, decoded and stripped
        """
        # self.executable is presumably set by shared.CMDLine.__init__ - the
        # base class is not visible from this file.
        cmd = (
            self.executable,
            '-e',
            'robots=off',
            '--timeout=360',
            '--no-clobber',
            '--no-directories',
            '--adjust-extension',
            '--span-hosts',
            '--wait=1',
            '--random-wait',
            '--convert-links',
            #'--backup-converted',
            '--page-requisites',
            '--directory-prefix=%s' % self.saveto,
            "%s" % self.url
        )
        logging.debug('getting URL %s with wget', self.url)
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        stdout, stderr = p.communicate()
        # wget writes its regular progress log to stderr, so output there
        # does not mean failure; the exit status does.
        if p.returncode != 0:
            logging.error(
                "Error getting URL:\n\t%s\n\t%s",
                cmd,
                stderr.decode('utf-8', errors='replace')
            )
        return stdout.decode('utf-8', errors='replace').strip()



# Collect every markdown file that declares a `bookmark-of` URL in its front
# matter. The `bookmarks` setting is presumably a directory; glob.glob()
# takes a single pattern, so the directory and the wildcard are joined first.
bookmarks = glob.glob(
    os.path.join(shared.config.get('dynamic', 'bookmarks'), '*.md')
)
bm = {}
for b in bookmarks:
    with open(b, 'rt') as fh:
        fm = frontmatter.loads(fh.read())
    if not fm.metadata.get('bookmark-of'):
        continue
    bm[b] = fm

# Fetch an offline copy for each bookmark that does not have one yet.
for fname, fm in bm.items():
    logging.info('dealing with %s', fname)
    url = fm.metadata.get('bookmark-of')
    f, ext = os.path.splitext(os.path.basename(fname))
    p = os.path.join(
        shared.config.get('source', 'offlinecopiesdir'),
        f
    )
    # an existing directory means this bookmark was already handled
    if os.path.isdir(p):
        continue

    # Uses the find_archiveorgurl() defined in this file.
    trueurl = find_archiveorgurl(url)
    if trueurl:
        w = wget(trueurl, dirname=f)
        w.archive()
    else:
        # neither the live page nor an archived snapshot could be located;
        # do not hand a None URL to wget
        logging.warning('no live or archived copy found for %s', url)

    # this is to skip the failed ones next time
    if not os.path.isdir(p):
        os.mkdir(p)