final commit for offlinecopies, for historical reference, before deleting it
Peter Molnar hello@petermolnar.eu
Wed, 18 Oct 2017 20:35:52 +0100
1 file changed,
114 insertions(+),
3 deletions(-)
jump to
M
offlinecopies.py
→
offlinecopies.py
@@ -1,8 +1,12 @@
import glob -import shared import os import logging +import json import frontmatter +import requests +from urllib.parse import urlparse, urlunparse +import shared + # remove the rest of the potential loggers while len(logging.root.handlers) > 0:@@ -14,7 +18,114 @@ level=10,
format='%(asctime)s - %(levelname)s - %(message)s' ) -bookmarks = glob.glob('/web/petermolnar.net/petermolnar.net/content/bookmark/*.md') + +def find_realurl(url): + headers = requests.utils.default_headers() + headers.update({ + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', + }) + + try: + r = requests.get( + url, + allow_redirects=True, + timeout=60, + headers=headers + ) + except Exception as e: + logging.error('getting real url failed: %s', e) + return (None, 400) + + finalurl = list(urlparse(r.url)) + finalurl[4] = '&'.join( + [x for x in finalurl[4].split('&') if not x.startswith('utm_')]) + finalurl = urlunparse(finalurl) + + return (finalurl, r.status_code) + +def find_archiveorgurl(url): + url, status = find_realurl(url) + if status == requests.codes.ok: + return url + + try: + a = requests.get( + "http://archive.org/wayback/available?url=%s" % url, + ) + except Exception as e: + logging.error('Failed to fetch archive.org availability for %s' % url) + return None + + if not a: + logging.error('empty archive.org availability for %s' % url) + return None + + try: + a = json.loads(a.text) + aurl = a.get( + 'archived_snapshots', {} + ).get( + 'closest', {} + ).get( + 'url', None + ) + if aurl: + logging.debug("found %s in archive.org for %s", aurl, url) + return aurl + except Exception as e: + logging.error("archive.org parsing failed: %s", e) + + return None + + +class wget(shared.CMDLine): + def __init__(self, url, dirname=None): + super().__init__('wget') + self.url = url + self.slug = dirname or slugfname(self.url) + self.saveto = os.path.join( + config.get('source', 'offlinecopiesdir'), + self.slug + ) + + def archive(self): + cmd = ( + self.executable, + '-e', + 'robots=off', + '--timeout=360', + '--no-clobber', + '--no-directories', + '--adjust-extension', + '--span-hosts', + '--wait=1', + '--random-wait', + '--convert-links', + #'--backup-converted', + '--page-requisites', + '--directory-prefix=%s' % self.saveto, 
+ "%s" % self.url + ) + logging.debug('getting URL %s with wget', self.url) + p = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + stdout, stderr = p.communicate() + if stderr: + logging.error( + "Error getting URL:\n\t%s\n\t%s", + cmd, + stderr + ) + return stdout.decode('utf-8').strip() + + + +bookmarks = glob.glob(shared.config.get('dynamic', 'bookmarks'), '*.md') bm = {} for b in bookmarks: with open(b, 'rt') as f:@@ -35,7 +146,7 @@ if os.path.isdir(p):
continue trueurl = shared.find_archiveorgurl(url) - w = shared.wget(trueurl, dirname=f) + w = wget(trueurl, dirname=f) w.archive() # this is to skip the failed ones next time