# nasg/shared.py
import configparser
import os
import re
import glob
import logging
import subprocess
import json
import requests
from urllib.parse import urlparse, urlunparse

from whoosh import fields
from whoosh import analysis
from slugify import slugify


LLEVEL = {
'critical': 50,
'error': 40,
'warning': 30,
'info': 20,
'debug': 10
}


def __expandconfig(config):
    """ Expand the configured directory names to full paths and add them
    back to the config as '<option>dir' entries """
    basepath = os.path.expanduser(config.get('common', 'base'))
config.set('common', 'basedir', basepath)
for section in ['source', 'target']:
for option in config.options(section):
opt = config.get(section, option)
config.set(section, "%sdir" % option, os.path.join(basepath,opt))
config.set('target', 'filesdir', os.path.join(
config.get('target', 'builddir'),
config.get('source', 'files'),
))
config.set('target', 'commentsdir', os.path.join(
config.get('target', 'builddir'),
config.get('site', 'commentspath'),
))
return config
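
# A minimal sketch of the config.ini layout __expandconfig expects; the
# values below are invented examples, not the original configuration:
#
#   [common]
#   base = ~/nasg
#
#   [source]
#   files = files
#   offlinecopies = offlinecopies
#
#   [target]
#   build = build
#
#   [site]
#   commentspath = comments
#
# after expansion, config.get('source', 'filesdir') and friends return
# absolute paths under the base directory.
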
def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
    """ Encode a number (typically an epoch timestamp) as a short,
    lowercase base-36 string """
num = int(num)
return ((num == 0) and numerals[0]) or (
baseN(
num // b,
b,
numerals
).lstrip(numerals[0]) + numerals[num % b]
)
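
# A few sanity checks for baseN (hedged, but easy to verify by hand):
#
#   >>> baseN(0)
#   '0'
#   >>> baseN(35)
#   'z'
#   >>> baseN(36)
#   '10'
#
# so a ten-digit epoch collapses into a six-character slug.
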
def slugfname(url):
    """ Make a short slug suitable for a filename out of a URL, dropping
    the scheme and any leading www. """
    return slugify(
        re.sub(r"^https?://(?:www\.)?", "", url),
        only_ascii=True,
        lower=True
    )[:200]
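
# Hedged example with a made-up URL; the exact output depends on the
# slugify package in use:
#
#   >>> slugfname('https://www.example.com/some/path')
#   'example-com-some-path'
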
ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
STRFISO = '%Y-%m-%dT%H:%M:%S%z'
# bare http(s) URLs preceded by whitespace in running text
URLREGEX = re.compile(
    r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
    r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
)
# EXIF-style timestamps: 'YYYY:MM:DD HH:MM:SS'
EXIFREXEG = re.compile(
    r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
    r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
)
# markdown images that point at local /files or /cache paths, with
# optional title and {attribute} block
MDIMGREGEX = re.compile(
    r'(!\[(.*)\]\((?:\/(?:files|cache)'
    r'(?:\/[0-9]{4}\/[0-9]{2})?\/(.*\.(?:jpe?g|png|gif)))'
    r'(?:\s+[\'\"]?(.*?)[\'\"]?)?\)(?:\{(.*?)\})?)',
    re.IGNORECASE
)
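
# A hedged illustration of the markdown image syntax MDIMGREGEX targets;
# the filename is invented:
#
#   ![alt text](/files/2017/06/photo.jpg "a title"){.someclass}
#
# group 2 captures the alt text, group 3 the filename, group 4 the title
# and group 5 the attribute block.
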
schema = fields.Schema(
url=fields.ID(
stored=True,
unique=True
),
category=fields.TEXT(
stored=True,
),
date=fields.DATETIME(
stored=True,
sortable=True
),
title=fields.TEXT(
stored=True,
analyzer=analysis.FancyAnalyzer()
),
weight=fields.NUMERIC(
sortable=True
),
img=fields.TEXT(
stored=True
),
content=fields.TEXT(
stored=True,
analyzer=analysis.FancyAnalyzer()
),
fuzzy=fields.NGRAMWORDS(
tokenizer=analysis.NgramTokenizer(4)
),
mtime=fields.NUMERIC(
stored=True
)
#slug=fields.NGRAMWORDS(
#tokenizer=analysis.NgramTokenizer(4)
#),
#reactions=fields.NGRAMWORDS(
#tokenizer=analysis.NgramTokenizer(4)
#),
#tags=fields.TEXT(
#stored=False,
#analyzer=analysis.KeywordAnalyzer(
#lowercase=True,
#commas=True
#),
#),
)
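
# A hedged sketch of feeding this schema to whoosh; the index directory
# name is an assumption, not taken from this module:
#
#   from whoosh import index
#   ix = index.create_in('searchdb', schema)
#   with ix.writer() as w:
#       w.add_document(url='https://example.com/entry', content='...')
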
config = configparser.ConfigParser(
interpolation=configparser.ExtendedInterpolation(),
allow_no_value=True
)
config.read('config.ini')
config = __expandconfig(config)


class CMDLine(object):
    """ Wrapper for piped command line calls """
    # exiftool prints this marker after each -execute in -stay_open mode;
    # execute() reads stdout until it appears (assumes exiftool's protocol)
    sentinel = "{ready}\n"

    def __init__(self, executable):
        self.executable = self._which(executable)
        if self.executable is None:
            raise OSError('No %s found in PATH!' % executable)

    @staticmethod
    def _which(name):
        """ Return the first match for name in PATH, or None """
        for d in os.environ['PATH'].split(':'):
            which = glob.glob(os.path.join(d, name), recursive=True)
            if which:
                return which.pop()
        return None

    def __enter__(self):
        # spawn the process in exiftool-style batch mode and keep it open
        self.process = subprocess.Popen(
            [self.executable, "-stay_open", "True", "-@", "-"],
            universal_newlines=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.process.stdin.write("-stay_open\nFalse\n")
        self.process.stdin.flush()

    def execute(self, *args):
        args = args + ("-execute\n",)
        self.process.stdin.write("\n".join(args))
        self.process.stdin.flush()
        output = ""
        fd = self.process.stdout.fileno()
        # read raw chunks until the sentinel marker terminates the response
        while not output.endswith(self.sentinel):
            output += os.read(fd, 4096).decode('utf-8', errors='ignore')
        return output[:-len(self.sentinel)]
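
# The context-manager half of CMDLine implements exiftool's -stay_open
# batch protocol, so a hedged usage sketch (placeholder path) would be:
#
#   with CMDLine('exiftool') as exif:
#       raw = exif.execute('-json', '/path/to/photo.jpg')
#
# Pandoc, HeadlessChromium and wget below skip __enter__ entirely and
# spawn one process per call instead.
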
class Pandoc(CMDLine):
""" Pandoc command line call with piped in- and output """

    def __init__(self, md2html=True):
        super().__init__('pandoc')
        # both the html5 and the plain output use the same markdown dialect
        md = "markdown+" + "+".join([
            'backtick_code_blocks',
            'auto_identifiers',
            'fenced_code_attributes',
            'definition_lists',
            'grid_tables',
            'pipe_tables',
            'strikeout',
            'superscript',
            'subscript',
            'markdown_in_html_blocks',
            'shortcut_reference_links',
            'autolink_bare_uris',
            'raw_html',
            'link_attributes',
            'header_attributes',
            'footnotes',
        ])
        if 'plain' == md2html:
            self.i = md
            self.o = 'plain'
        elif md2html:
            self.i = md
            self.o = 'html5'
        else:
            self.i = 'html'
            self.o = "markdown-" + "-".join([
                'raw_html',
                'native_divs',
                'native_spans',
            ])

def convert(self, text):
cmd = (
self.executable,
'-o-',
'--from=%s' % self.i,
'--to=%s' % self.o
)
logging.debug('converting string with Pandoc')
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = p.communicate(input=text.encode())
if stderr:
logging.error(
"Error during pandoc covert:\n\t%s\n\t%s",
cmd,
stderr
)
return stdout.decode('utf-8').strip()
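
# Hedged example of the default markdown -> html5 direction; exact output
# can differ between pandoc versions:
#
#   html = Pandoc().convert('# Hello *world*')
#   # roughly '<h1 id="hello-world">Hello <em>world</em></h1>'
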
class HeadlessChromium(CMDLine):
    """ Fetch the rendered DOM of a URL with headless Chromium """

    def __init__(self, url):
        super().__init__('chromium-browser')
        self.url = url

    def get(self):
cmd = (
self.executable,
'--headless',
'--disable-gpu',
'--disable-preconnect',
'--dump-dom',
            '--timeout=60',
'--save-page-as-mhtml',
"%s" % self.url
)
logging.debug('getting URL %s with headless chrome', self.url)
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = p.communicate()
if stderr:
logging.error(
"Error getting URL:\n\t%s\n\t%s",
cmd,
stderr
)
return stdout.decode('utf-8').strip()
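
# Hedged usage sketch (placeholder URL): returns whatever --dump-dom
# printed, i.e. the rendered document as an HTML string:
#
#   dom = HeadlessChromium('https://example.com/').get()
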
class wget(CMDLine):
    """ Save an offline copy of a URL and its page requisites with wget """

    def __init__(self, url, dirname=None):
        super().__init__('wget')
        self.url = url
        self.slug = dirname or slugfname(self.url)
        self.saveto = os.path.join(
            config.get('source', 'offlinecopiesdir'),
            self.slug
        )

    def archive(self):
cmd = (
self.executable,
'-e',
'robots=off',
'--timeout=360',
'--no-clobber',
'--no-directories',
'--adjust-extension',
'--span-hosts',
'--wait=1',
'--random-wait',
'--convert-links',
#'--backup-converted',
'--page-requisites',
'--directory-prefix=%s' % self.saveto,
"%s" % self.url
)
logging.debug('getting URL %s with wget', self.url)
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = p.communicate()
if stderr:
logging.error(
"Error getting URL:\n\t%s\n\t%s",
cmd,
stderr
)
return stdout.decode('utf-8').strip()
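
# Hedged usage sketch (placeholder URL); the copy lands under
# <offlinecopiesdir>/<slug>/ as set up in __init__:
#
#   wget('https://example.com/article').archive()
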
def find_realurl(url):
    """ Follow redirects to the final URL and strip utm_* tracking
    parameters; returns a (url, status code) tuple """
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })
try:
r = requests.get(
url,
allow_redirects=True,
timeout=60,
headers=headers
)
except Exception as e:
logging.error('getting real url failed: %s', e)
return (None, 400)
    finalurl = list(urlparse(r.url))
    # index 4 of the parsed URL is the query string; drop utm_* params
    finalurl[4] = '&'.join(
        [x for x in finalurl[4].split('&') if not x.startswith('utm_')])
    finalurl = urlunparse(finalurl)
return (finalurl, r.status_code)
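
# Hedged example with a made-up URL, assuming no redirects:
#
#   find_realurl('https://example.com/post?utm_source=feed&id=1')
#   # -> ('https://example.com/post?id=1', 200)
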
def find_archiveorgurl(url):
    """ Return the resolved live URL if it responds with HTTP 200,
    otherwise the closest archive.org snapshot, or None """
    url, status = find_realurl(url)
    if status == requests.codes.ok:
        return url
try:
a = requests.get(
"http://archive.org/wayback/available?url=%s" % url,
)
    except Exception as e:
        logging.error(
            'failed to fetch archive.org availability for %s: %s',
            url,
            e
        )
        return None
    # requests.Response is falsy on 4xx/5xx status codes
    if not a:
        logging.error('empty archive.org availability response for %s', url)
        return None
try:
a = json.loads(a.text)
aurl = a.get(
'archived_snapshots', {}
).get(
'closest', {}
).get(
'url', None
)
if aurl:
logging.debug("found %s in archive.org for %s", aurl, url)
return aurl
except Exception as e:
logging.error("archive.org parsing failed: %s", e)
return None
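
# Hedged usage sketch: prefer the live page, fall back to the closest
# Wayback Machine snapshot:
#
#   url = find_archiveorgurl('https://example.com/gone-away')
#   # -> the live URL, an archive.org snapshot URL, or None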