nasg/shared.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: set fileencoding=utf-8 :

__author__ = "Peter Molnar"
__copyright__ = "Copyright 2017, Peter Molnar"
__license__ = "GPLv3"
__version__ = "2.0"
__maintainer__ = "Peter Molnar"
__email__ = "hello@petermolnar.eu"
__status__ = "Production"

"""
    silo archiver module of NASG
    Copyright (C) 2017 Peter Molnar

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software Foundation,
    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
"""

import configparser
import os
import re
import glob
import logging
import subprocess
import json
import sqlite3
import requests
from slugify import slugify
import jinja2


class CMDLine(object):
    def __init__(self, executable):
        self.executable = self._which(executable)
        if self.executable is None:
            raise OSError('No %s found in PATH!' % executable)
            return

    @staticmethod
    def _which(name):
        for d in os.environ['PATH'].split(':'):
            which = glob.glob(os.path.join(d, name), recursive=True)
            if which:
                return which.pop()
        return None


class XRay(CMDLine):
    cmd_prefix = 'chdir("/usr/local/lib/php/xray"); include("vendor/autoload.php"); $xray = new p3k\XRay();'

    def __init__(self, url):
        super().__init__('php')
        self.url = url
        self.target = ''
        self.cmd = (
            self.executable,
            '-r',
            '%s; echo(json_encode($xray->parse("%s")));' % (
                self.cmd_prefix,
                self.url
            )
        )

    def set_receive(self, target):
        self.cmd = (
            self.executable,
            '-r',
            '%s; echo(json_encode($xray->parse("%s")));' % (
                self.cmd_prefix,
                self.url,
                target
            )
        )
        return self

    def set_discover(self):
        self.cmd = (
            self.executable,
            '-r',
            '%s; echo(json_encode($xray->rels("%s")));' % (
                self.cmd_prefix,
                self.url,
            )
        )
        return self

    def parse(self):
        logging.debug('pulling %s with XRay', self.url)
        p = subprocess.Popen(
            self.cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        stdout, stderr = p.communicate()
        if stderr:
            logging.error("Error with XRay: %s", stderr)

        return json.loads(stdout.decode('utf-8').strip())


class Pandoc(CMDLine):
    """ Pandoc command line call with piped in- and output """

    def __init__(self, md2html=True):
        super().__init__('pandoc')
        if True == md2html:
            self.i = "markdown+" + "+".join([
                'backtick_code_blocks',
                'auto_identifiers',
                'fenced_code_attributes',
                'definition_lists',
                'grid_tables',
                'pipe_tables',
                'strikeout',
                'superscript',
                'subscript',
                'markdown_in_html_blocks',
                'shortcut_reference_links',
                'autolink_bare_uris',
                'raw_html',
                'link_attributes',
                'header_attributes',
                'footnotes',
            ])
            self.o = 'html5'
        elif 'plain' == md2html:
            self.i = "markdown+" + "+".join([
                'backtick_code_blocks',
                'auto_identifiers',
                'fenced_code_attributes',
                'definition_lists',
                'grid_tables',
                'pipe_tables',
                'strikeout',
                'superscript',
                'subscript',
                'markdown_in_html_blocks',
                'shortcut_reference_links',
                'autolink_bare_uris',
                'raw_html',
                'link_attributes',
                'header_attributes',
                'footnotes',
            ])
            self.o = "plain"
        else:
            self.o = "markdown-" + "-".join([
                'raw_html',
                'native_divs',
                'native_spans',
            ])
            self.i = 'html'

    def convert(self, text):
        cmd = (
            self.executable,
            '-o-',
            '--from=%s' % self.i,
            '--to=%s' % self.o
        )
        logging.debug('converting string with Pandoc')
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        stdout, stderr = p.communicate(input=text.encode())
        if stderr:
            logging.error(
                "Error during pandoc covert:\n\t%s\n\t%s",
                cmd,
                stderr
            )
        return stdout.decode('utf-8').strip()


class ExifTool(CMDLine):
    def __init__(self, fpath):
        self.fpath = fpath
        super().__init__('exiftool')

    @staticmethod
    def exifdate2iso(value):
        """ converts and EXIF date string to ISO 8601 format

        :param value: EXIF date (2016:05:01 00:08:24)
        :type arg1: str
        :return: ISO 8601 string with UTC timezone 2016-05-01T00:08:24+0000
        :rtype: str
        """
        if not isinstance(value, str):
            return value
        match = REGEX['exifdate'].match(value)
        if not match:
            return value
        return "%s-%s-%sT%s+0000" % (
            match.group('year'),
            match.group('month'),
            match.group('day'),
            match.group('time')
        )

    def read(self):
        cmd = (
            self.executable,
            '-sort',
            '-json',
            '-MIMEType',
            '-FileType',
            '-FileName',
            '-ModifyDate',
            '-CreateDate',
            '-DateTimeOriginal',
            '-ImageHeight',
            '-ImageWidth',
            '-Aperture',
            '-FOV',
            '-ISO',
            '-FocalLength',
            '-FNumber',
            '-FocalLengthIn35mmFormat',
            '-ExposureTime',
            '-Copyright',
            '-Artist',
            '-Model',
            '-GPSLongitude#',
            '-GPSLatitude#',
            '-LensID',
            '-LensSpec',
            '-Lens',
            '-ReleaseDate',
            '-Description',
            '-Headline',
            '-HierarchicalSubject',
            self.fpath
        )

        logging.debug('reading EXIF from %s', self.fpath)
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        stdout, stderr = p.communicate()
        if stderr:
            logging.error("Error reading EXIF:\n\t%s\n\t%s", cmd, stderr)

        exif = json.loads(stdout.decode('utf-8').strip()).pop()
        if 'ReleaseDate' in exif and 'ReleaseTime' in exif:
            exif['DateTimeRelease'] = "%s %s" % (
                exif.get('ReleaseDate'), exif.get('ReleaseTime')[:8])
            del(exif['ReleaseDate'])
            del(exif['ReleaseTime'])

        for k, v in exif.items():
            exif[k] = self.exifdate2iso(v)

        return exif


class BaseDB(object):
    def __init__(self, fpath):
        self.db = sqlite3.connect(fpath)
        self.db.execute('PRAGMA auto_vacuum = INCREMENTAL;')
        self.db.execute('PRAGMA journal_mode = MEMORY;')
        self.db.execute('PRAGMA temp_store = MEMORY;')
        self.db.execute('PRAGMA locking_mode = NORMAL;')
        self.db.execute('PRAGMA synchronous = FULL;')
        self.db.execute('PRAGMA encoding = "UTF-8";')

    def __exit__(self):
        self.finish()

    def finish(self):
        cursor = self.db.cursor()
        cursor.execute('PRAGMA auto_vacuum;')
        self.db.close()


class TokenDB(object):
    def __init__(self, uuid='tokens'):
        self.db = config.get('var', 'tokendb')
        self.tokens = {}
        self.refresh()

    def refresh(self):
        self.tokens = {}
        if os.path.isfile(self.db):
            with open(self.db, 'rt') as f:
                self.tokens = json.loads(f.read())

    def save(self):
        with open(self.db, 'wt') as f:
            f.write(json.dumps(
                self.tokens, indent=4, sort_keys=True
            ))

    def get_token(self, token):
        return self.tokens.get(token, None)

    def get_service(self, service):
        token = self.tokens.get(service, None)
        return token

    def set_service(self, service, tokenid):
        self.tokens.update({
            service: tokenid
        })
        self.save()

    def update_token(self,
                     token,
                     oauth_token_secret=None,
                     access_token=None,
                     access_token_secret=None,
                     verifier=None):

        t = self.tokens.get(token, {})
        if oauth_token_secret:
            t.update({
                'oauth_token_secret': oauth_token_secret
            })
        if access_token:
            t.update({
                'access_token': access_token
            })
        if access_token_secret:
            t.update({
                'access_token_secret': access_token_secret
            })
        if verifier:
            t.update({
                'verifier': verifier
            })

        self.tokens.update({
            token: t
        })
        self.save()

    def clear(self):
        self.tokens = {}
        self.save()

    def clear_service(self, service):
        t = self.tokens.get(service)
        if t:
            del(self.tokens[t])
        del(self.tokens[service])
        self.save()


class SearchDB(BaseDB):
    tmplfile = 'Search.html'

    def __init__(self):
        self.fpath = "%s" % config.get('var', 'searchdb')
        super().__init__(self.fpath)
        cursor = self.db.cursor()
        cursor.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS data USING FTS5(
                id,
                corpus,
                mtime,
                url,
                category,
                title,
                tokenize = 'porter'
            )''')
        self.db.commit()

    def __exit__(self):
        self.finish()

    def finish(self):
        cursor = self.db.cursor()
        cursor.execute('''PRAGMA auto_vacuum;''')
        self.db.close()

    def append(self, id, corpus, mtime, url, category, title):
        mtime = int(mtime)
        logging.debug("adding %s to searchdb", id)
        cursor = self.db.cursor()
        cursor.execute('''DELETE FROM data WHERE id=?''', (id,))
        cursor.execute('''INSERT OR IGNORE INTO data (id, corpus, mtime, url, category, title) VALUES (?,?,?,?,?,?);''', (
            id,
            corpus,
            mtime,
            url,
            category,
            title
        ))
        self.db.commit()

    def is_uptodate(self, fname, mtime):
        mtime = int(mtime)
        ret = {}
        cursor = self.db.cursor()
        cursor.execute('''SELECT mtime
            FROM data
            WHERE id = ? AND mtime = ?''',
                       (fname, mtime)
                       )
        rows = cursor.fetchall()

        if len(rows):
            logging.debug("%s is up to date in searchdb", fname)
            return True

        logging.debug("%s is out of  date in searchdb", fname)
        return False

    def search_by_query(self, query):
        ret = {}
        cursor = self.db.cursor()
        cursor.execute('''SELECT
            id, category, url, title, snippet(data, 1, '', '', '[...]', 24)
            FROM data
            WHERE data MATCH ?
            ORDER BY category, rank;''', (query,))
        rows = cursor.fetchall()
        for r in rows:
            r = {
                'id': r[0],
                'category': r[1],
                'url': r[2],
                'title': r[3],
                'txt': r[4],
            }

            category = r.get('category')
            if category not in ret:
                ret.update({category: {}})

            maybe_fpath = os.path.join(
                config.get('dirs', 'content'),
                category,
                "%s.*" % r.get('id')
            )
            #fpath = glob.glob(maybe_fpath).pop()
            ret.get(category).update({
                r.get('id'): {
                    #'fpath': fpath,
                    'url': r.get('url'),
                    'title': r.get('title'),
                    'txt': r.get('txt')
                }
            })
        return ret

    def cli(self, query):
        results = self.search_by_query(query)
        for c, items in sorted(results.items()):
            print("%s:" % c)
            for fname, data in sorted(items.items()):
                print("  %s" % data.get('fpath'))
                print("  %s" % data.get('url'))
                print("")

    def html(self, query):
        tmplvars = {
            'results': self.search_by_query(query),
            'term': query
        }
        return j2.get_template(self.tmplfile).render(tmplvars)


class WebmentionQueue(BaseDB):
    def __init__(self):
        self.fpath = "%s" % config.get('var', 'webmentiondb')
        super().__init__(self.fpath)
        cursor = self.db.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS  `queue` (
                `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
                `timestamp` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
                `source` TEXT NOT NULL,
                `target` TEXT NOT NULL,
                `status` INTEGER NOT NULL DEFAULT 0,
                `mtime` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
            );
        ''')
        self.db.commit()

    def __exit__(self):
        self.finish()

    def finish(self):
        self.db.close()

    def queue(self, source, target):
        cursor = self.db.cursor()
        cursor.execute(
            '''INSERT INTO queue (source,target) VALUES (?,?);''', (
                source,
                target
            )
        )
        r = cursor.lastrowid
        self.db.commit()
        return r

    def get_queued(self, fname=None):
        logging.debug('getting queued webmentions for %s', fname)
        ret = []
        cursor = self.db.cursor()
        cursor.execute(
            '''SELECT * FROM queue WHERE target LIKE ? AND status = 0''',
            ('%' +
             fname +
             '%',
             ))
        rows = cursor.fetchall()
        for r in rows:
            ret.append({
                'id': r[0],
                'dt': r[1],
                'source': r[2],
                'target': r[3],
            })
        return ret

    def entry_done(self, id):
        logging.debug('setting %s webmention to done', id)
        cursor = self.db.cursor()
        cursor.execute("UPDATE queue SET status = 1 where ID=?", (id,))
        self.db.commit()


def __expandconfig():
    c = configparser.ConfigParser(
        interpolation=configparser.ExtendedInterpolation(),
        allow_no_value=True
    )
    c.read('config.ini')
    for s in c.sections():
        for o in c.options(s):
            curr = c.get(s, o)
            if 'photo' == s and 'regex' == o:
                REGEX.update({'photo': re.compile(curr)})
            c.set(s, o, os.path.expanduser(curr))
    return c


def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
    """ Used to create short, lowercase slug for a number (an epoch) passed """
    num = int(num)
    return ((num == 0) and numerals[0]) or (
        baseN(
            num // b,
            b,
            numerals
        ).lstrip(numerals[0]) + numerals[num % b]
    )


def slugfname(url):
    return "%s" % slugify(
        re.sub(r"^https?://(?:www)?", "", url),
        only_ascii=True,
        lower=True
    )[:200]


def __setup_sitevars():
    SiteVars = {}
    section = 'site'
    for o in config.options(section):
        SiteVars.update({o: config.get(section, o)})

    # this should be a nice recursive function instead
    # extra site section - nope, because it relies on order
    # and author won't get appended
    for section in config.get('site', 'appendwith').split():
        SiteVars.update({section: {}})
        for o in config.options(section):
            SiteVars[section].update({o: config.get(section, o)})
        if not config.get(section, 'appendwith', fallback=False):
            continue
        # subsections
        for sub in config.get(section, 'appendwith').split():
            SiteVars[section].update({sub: {}})
            for o in config.options(sub):
                SiteVars[section][sub].update({o: config.get(sub, o)})

    return SiteVars


def notify(msg):
    # telegram notification, if set
    if not config.has_section('api_telegram'):
        return

    url = "https://api.telegram.org/bot%s/sendMessage" % (
        config.get('api_telegram', 'api_token')
    )
    data = {
        'chat_id': config.get('api_telegram', 'chat_id'),
        'text': msg
    }
    # fire and forget
    try:
        requests.post(url, data=data)
    except BaseException:
        pass


ARROWFORMAT = {
    'iso': 'YYYY-MM-DDTHH:mm:ssZ',
    'display': 'YYYY-MM-DD HH:mm',
    'rcf': 'ddd, DD MMM YYYY HH:mm:ss Z',
    'twitter': 'ddd MMM DD HH:mm:ss Z YYYY'
}

LLEVEL = {
    'critical': 50,
    'error': 40,
    'warning': 30,
    'info': 20,
    'debug': 10
}

REGEX = {
    'exifdate': re.compile(
        r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
        r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
    ),
    'cleanurl': re.compile(r"^https?://(?:www)?"),
    'urls': re.compile(
        r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
        r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
    ),
    'mdimg': re.compile(
        r'(?P<shortcode>\!\[(?P<alt>[^\]]+)\]\((?P<fname>[^\s]+)'
        r'(?:\s[\'\"](?P<title>[^\"\']+)[\'\"])?\)(?:\{(?P<css>[^\}]+)\})?)',
        re.IGNORECASE
    )
}

config = __expandconfig()

j2 = jinja2.Environment(
    loader=jinja2.FileSystemLoader(
        searchpath=config.get('dirs', 'tmpl')
    ),
    lstrip_blocks=True
)

site = __setup_sitevars()