2.0-alpha1: tags dropped, favs dropped, bookmarks dropped, reposts dropped, better async rendering; TODO comments, websub pings, webmentions

This commit is contained in:
Peter Molnar 2017-10-27 10:29:33 +01:00
parent 112448cf92
commit 4a699ef9f5
7 changed files with 1495 additions and 2234 deletions

168
README.md

@@ -1,8 +1,166 @@
-# NASG: Not Another Statig Generator...
-So I ended up writing my static generator and this is (most) of the code for it.
-Don't expect anything fancy and please be aware that my Python Fu has much to learn.
-I've written about the generic ideas and approaches here in my
-[Going Static](https://petermolnar.net/going-static/) entry.

# NASG (Not Another Static Generator)

This is a tiny static site generator, written in Python, to scratch my own itches.
It is most probably not suitable for anyone else.

## Why not [insert static generator here]?

- DRY - Don't Repeat Yourself - is good, so instead of sidecar files for images, I'm using XMP metadata, which most of the available ones don't handle well;
- writing a proper plugin for an existing generator - Pelican, Nikola, etc. - might have taken longer, and I wanted to extend my Python knowledge;
- I wanted to use the best available utilities for some tasks, such as `Pandoc` and `exiftool`, instead of Python libraries trying to achieve the same;
- I needed to handle webmentions and comments.

Don't expect anything fancy: my Python Fu has much to learn.
## How content is organized
The directory structure of the "source" is something like this:
```
├── content
│   ├── category1 (containing YAML + MD files)
│   ├── category2 (containing YAML + MD files)
│   ├── photo (containing jpg files)
│   ├── _category_excluded_from_listing_1 (containing YAML + MD files)
├── files
│   ├── image (my own pictures)
│   ├── photo -> ../content/photo
│   └── pic (random images)
├── nasg
│   ├── archive.py
│   ├── config.ini
│   ├── db.py
│   ├── LICENSE
│   ├── nasg.py
│   ├── README.md
│   ├── requirements.txt
│   ├── router.py
│   ├── shared.py
│   └── templates
├── static
│   ├── favicon.ico
│   ├── favicon.png
│   └── pgp.asc
└── var
    ├── gone.tsv
    ├── redirects.tsv
    ├── s.sqlite
    ├── tokens.json
    └── webmention.sqlite
```
Content files are either YAML front matter plus Markdown, with an `.md` extension, or JPEG files with embedded metadata, with a `.jpg` extension.
Inline images in the content are matched against all subdirectories of `files`; their EXIF data is read and displayed as well if the Artist and/or Copyright EXIF fields match the regex set in the configuration.
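For illustration, a hypothetical `content/article/example-entry.md` could look like the sketch below; the exact front matter keys are whatever `nasg.py` expects, so the field names here are only placeholders:

```
---
title: An example entry
published: 2017-10-27T10:29:33+01:00
summary: a one-line description used in listings and feeds
---

Body text in Markdown. An inline image such as
![a sunset over the lake](/files/image/sunset.jpg) is looked up in the
`files` subdirectories and, if its Artist/Copyright EXIF matches the
configured regex, its camera metadata is displayed with the picture.
```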
`gone.tsv` is a simple list of URIs that should return a `410 Gone` response, while `redirects.tsv` is a tab-separated file of `from to` entries that should be `301` redirected. Both go into a `magic.php` file, so if the host can execute PHP, it takes care of these.
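As a sketch (the paths are made up), `gone.tsv` is one URI per line:

```
/some-deleted-entry/
/drafts/never-published/
```

and `redirects.tsv` holds tab-separated `from to` pairs:

```
/old-slug/	/new-slug/
/rss/	/category/article/feed/
```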
## Output
`nasg.py` generates a `build` directory with one directory per entry, each containing an `index.html`, so URLs look like `https://domain.com/filename/`.
Categories are rendered into `category/category_name`. Pagination is under `category/category_name/page/X`. Each category also gets a feed at `category/category_name/feed`, in the form of an `index.atom` Atom feed.
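For a hypothetical entry `example-entry` in category `article`, the generated tree would therefore look roughly like this:

```
build
├── example-entry
│   └── index.html
└── category
    └── article
        ├── index.html
        ├── page
        │   └── 2
        │       └── index.html
        └── feed
            └── index.atom
```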
## Webserver configuration
A minimal nginx configuration for the virtualhost:
```
# --- Virtual Host ---
upstream {{ domain }} {
server unix:/var/run/php/{{ domain }}.sock;
}
server {
listen 80;
server_name .{{ domain }};
rewrite ^ https://$server_name$request_uri redirect;
access_log /dev/null;
error_log /dev/null;
}
server {
listen 443 ssl http2;
server_name .{{ domain }};
ssl_certificate /etc/letsencrypt/live/{{ domain }}/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/{{ domain }}/privkey.pem;
ssl_dhparam dh.pem;
add_header X-Frame-Options "SAMEORIGIN";
add_header X-Content-Type-Options "nosniff";
add_header X-XSS-Protection "1; mode=block";
add_header Strict-Transport-Security "max-age=31536000; includeSubdomains;";
root /[path to root]/{{ domain }};
location = /favicon.ico {
log_not_found off;
access_log off;
}
location = /robots.txt {
log_not_found off;
access_log off;
}
location ~ ^(?<script_name>.+?\.php)(?<path_info>.*)$ {
try_files $uri $script_name =404;
fastcgi_param SCRIPT_FILENAME $document_root$script_name;
fastcgi_param SCRIPT_NAME $script_name;
fastcgi_param PATH_INFO $path_info;
fastcgi_param PATH_TRANSLATED $document_root$path_info;
fastcgi_param QUERY_STRING $query_string;
fastcgi_param REQUEST_METHOD $request_method;
fastcgi_param CONTENT_TYPE $content_type;
fastcgi_param CONTENT_LENGTH $content_length;
fastcgi_param SCRIPT_NAME $script_name;
fastcgi_param REQUEST_URI $request_uri;
fastcgi_param DOCUMENT_URI $document_uri;
fastcgi_param DOCUMENT_ROOT $document_root;
fastcgi_param SERVER_PROTOCOL $server_protocol;
fastcgi_param GATEWAY_INTERFACE CGI/1.1;
fastcgi_param SERVER_SOFTWARE nginx;
fastcgi_param REMOTE_ADDR $remote_addr;
fastcgi_param REMOTE_PORT $remote_port;
fastcgi_param SERVER_ADDR $server_addr;
fastcgi_param SERVER_PORT $server_port;
fastcgi_param SERVER_NAME $server_name;
fastcgi_param HTTP_PROXY "";
fastcgi_param HTTPS $https if_not_empty;
fastcgi_param SSL_PROTOCOL $ssl_protocol if_not_empty;
fastcgi_param SSL_CIPHER $ssl_cipher if_not_empty;
fastcgi_param SSL_SESSION_ID $ssl_session_id if_not_empty;
fastcgi_param SSL_CLIENT_VERIFY $ssl_client_verify if_not_empty;
fastcgi_param REDIRECT_STATUS 200;
fastcgi_index index.php;
fastcgi_connect_timeout 10;
fastcgi_send_timeout 360;
fastcgi_read_timeout 3600;
fastcgi_buffer_size 512k;
fastcgi_buffers 512 512k;
fastcgi_keep_conn on;
fastcgi_intercept_errors on;
fastcgi_split_path_info ^(?<script_name>.+?\.php)(?<path_info>.*)$;
fastcgi_pass {{ domain }};
}
location / {
try_files $uri $uri/ $uri.html $uri/index.html $uri/index.xml $uri/index.atom index.php @rewrites;
}
location @rewrites {
rewrite ^ /magic.php?$args last;
}
location ~* \.(css|js|eot|woff|ttf|woff2)$ {
expires 1d;
add_header Cache-Control "public, must-revalidate, proxy-revalidate";
add_header "Vary" "Accept-Encoding";
}
location ~* \.(png|ico|gif|svg|jpg|jpeg|webp|avi|mpg|mpeg|mp4|mp3)$ {
expires 7d;
add_header Cache-Control "public, must-revalidate, proxy-revalidate";
add_header "Vary" "Accept-Encoding";
}
}
```

archive.py

@@ -5,14 +5,16 @@ import glob
import logging
import shutil
import subprocess
import imghdr

import arrow
from pprint import pprint
from requests_oauthlib import OAuth1Session, oauth1_session, OAuth2Session, oauth2_session
from oauthlib.oauth2 import BackendApplicationClient

import db
import shared

class Favs(object):
    def __init__(self, confgroup):
        self.confgroup = confgroup

@@ -101,6 +103,7 @@ class FlickrFavs(Favs):
            fav = FlickrFav(photo)
            if not fav.exists:
                fav.run()
                #fav.fix_extension()

class FivehpxFavs(Favs):
    def __init__(self):

@@ -179,6 +182,7 @@ class FivehpxFavs(Favs):
            fav = FivehpxFav(photo)
            if not fav.exists:
                fav.run()
                #fav.fix_extension()

class TumblrFavs(Favs):

@@ -242,7 +246,7 @@ class DAFavs(Favs):
            'https://www.deviantart.com/api/v1/oauth2/collections/folders',
            params={
                'username': self.username,
-                'calculate_size': 'false',
                'calculate_size': 'true',
                'ext_preload': 'false',
                'mature_content': 'true'
            }

@@ -304,29 +308,29 @@ class DAFavs(Favs):
        has_more = self.has_more(js.get('has_more'))
        offset = js.get('next_offset')
        while True == has_more:
-            logging.info('iterating over DA results with offset %d', offset)
            #logging.info('iterating over DA results with offset %d', offset)
            paged = self.getpaged(offset)
            new = paged.get('results', [])
            if not len(new):
                #logging.error('empty results from deviantART, breaking loop')
                break
-            favs = favs + new
            favs = [*favs, *new]
            has_more = self.has_more(paged.get('has_more'))
            if not has_more:
                break
            n = int(paged.get('next_offset'))
            if not n:
                break
-            offset = offset + n
            offset = n
        self.favs = favs
        for fav in self.favs:
            f = DAFav(fav)
-            if f.exists:
-                continue
-            f.fav.update({'meta': self.getsinglemeta(fav.get('deviationid'))})
-            f.run()
            if not f.exists:
                f.fav.update({'meta': self.getsinglemeta(fav.get('deviationid'))})
                f.run()
                #f.fix_extension()

class ImgFav(object):
    def __init__(self):

@@ -349,7 +353,19 @@ class ImgFav(object):
    @property
    def exists(self):
-        return os.path.exists(self.target)
        maybe = glob.glob(self.target.replace('.jpg', '.*'))
        if len(maybe):
            return True
        return False

    def fix_extension(self):
        # identify file format
        what = imghdr.what(self.target)
        # rename file
        new = self.target.replace('.jpg', '.%s' % what)
        if new != self.target:
            shutil.move(self.target, new)
            self.target = new

    def pull_image(self):
        logging.info("pulling image %s to %s", self.imgurl, self.target)

@@ -359,8 +375,11 @@ class ImgFav(object):
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)

    def write_exif(self):
        what = imghdr.what(self.target)
        if 'jpg' != what or 'png' != what:
            return
        logging.info('populating EXIF data of %s' % self.target)
        tags = list(set(self.meta.get('tags',[])))
        dt = self.meta.get('dt').to('utc')

@@ -387,7 +406,7 @@ class ImgFav(object):
        params = [
            'exiftool',
            '-overwrite_original',
-            '-EXIF:Artist=%s' % author_name[:64],
            #'-EXIF:Artist=%s' % author_name[:64],
            '-XMP:Copyright=Copyright %s %s (%s)' % (
                dt.format('YYYY'),
                author_name,

@@ -501,6 +520,7 @@ class FlickrFav(ImgFav):
            self.photo.get('description', {}).get('_content', '')
        )
        self.fix_extension()
        self.write_exif()

class FivehpxFav(ImgFav):

@@ -546,12 +566,14 @@ class FivehpxFav(ImgFav):
        }
        c = "%s" % self.photo.get('description', '')
        self.content = shared.Pandoc('plain').convert(c)
        self.fix_extension()
        self.write_exif()

class DAFav(ImgFav):
    def __init__(self, fav):
        self.fav = fav
        self.deviationid = fav.get('deviationid')
        #logging.info('working on %s', self.deviationid)
        self.url = fav.get('url')
        self.title = fav.get('title', False) or self.deviationid
        self.author = self.fav.get('author').get('username')

@@ -562,9 +584,21 @@ class DAFav(ImgFav):
                shared.slugfname(self.author)
            )
        )
-        self.imgurl = fav.get('content', {}).get('src')
        self.imgurl = None
        if 'content' in fav:
            if 'src' in fav['content']:
                self.imgurl = fav.get('content').get('src')
        elif 'preview' in fav:
            if 'src' in fav['preview']:
                self.imgurl = fav.get('preview').get('src')

    def run(self):
        if not self.imgurl:
            logging.error('imgurl is empty for deviantart %s', self.deviationid)
            return
        self.pull_image()

        self.meta = {

@@ -583,6 +617,7 @@ class DAFav(ImgFav):
        }
        c = "%s" % self.fav.get('meta', {}).get('description', '')
        self.content = shared.Pandoc('plain').convert(c)
        self.fix_extension()
        self.write_exif()

@@ -600,7 +635,10 @@ class TumblrFav(object):
    @property
    def exists(self):
-        return os.path.exists(self.target.replace('.jpg', '_0.jpg'))
        maybe = glob.glob(self.target.replace('.jpg', '_0.*'))
        if len(maybe):
            return True
        return False

    def run(self):
        content = "%s" % self.like.get('caption', '')

@@ -635,6 +673,7 @@ class TumblrFav(object):
            img.content = content
            img.meta = meta
            img.pull_image()
            img.fix_extension()
            img.write_exif()
            icntr = icntr + 1

@@ -681,7 +720,7 @@ class Oauth1Flow(object):
        self.service = service
        self.key = shared.config.get("api_%s" % service, 'api_key')
        self.secret = shared.config.get("api_%s" % service, 'api_secret')
-        self.tokendb = shared.TokenDB()
        self.tokendb = db.TokenDB()
        self.t = self.tokendb.get_service(self.service)
        self.oauth_init()

@@ -796,7 +835,7 @@ class TumblrOauth(Oauth1Flow):
if __name__ == '__main__':
-    logging.basicConfig(level=10)
    logging.basicConfig(level=20)
    flickr = FlickrFavs()
    flickr.run()

234
db.py Normal file

@@ -0,0 +1,234 @@
import os
import json
import sqlite3
import glob
import shared
# TODO sqlite3 cache instead of filesystem ?
class TokenDB(object):
def __init__(self, uuid='tokens'):
self.db = shared.config.get('var', 'tokendb')
self.tokens = {}
self.refresh()
def refresh(self):
self.tokens = {}
if os.path.isfile(self.db):
with open(self.db, 'rt') as f:
self.tokens = json.loads(f.read())
def save(self):
with open(self.db, 'wt') as f:
f.write(json.dumps(
self.tokens, indent=4, sort_keys=True
))
def get_token(self, token):
return self.tokens.get(token, None)
def get_service(self, service):
token = self.tokens.get(service, None)
return token
def set_service(self, service, tokenid):
self.tokens.update({
service: tokenid
})
self.save()
def update_token(self,
token,
oauth_token_secret=None,
access_token=None,
access_token_secret=None,
verifier=None):
t = self.tokens.get(token, {})
if oauth_token_secret:
t.update({
'oauth_token_secret': oauth_token_secret
})
if access_token:
t.update({
'access_token': access_token
})
if access_token_secret:
t.update({
'access_token_secret': access_token_secret
})
if verifier:
t.update({
'verifier': verifier
})
self.tokens.update({
token: t
})
self.save()
def clear(self):
self.tokens = {}
self.save()
def clear_service(self, service):
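# note: the service key maps to a token id; drop both the token entry and the service mapping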
t = self.tokens.get(service)
if t:
del(self.tokens[t])
del(self.tokens[service])
self.save()
class SearchDB(object):
tmplfile = 'Search.html'
def __init__(self):
self.db = sqlite3.connect(
"%s" % shared.config.get('var', 'searchdb')
)
cursor = self.db.cursor()
cursor.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS data USING FTS5(
id,
corpus,
mtime,
url,
category,
title
)''')
self.db.commit()
def __exit__(self):
self.finish()
def finish(self):
self.db.close()
def append(self, id, corpus, mtime, url, category, title):
mtime = int(mtime)
cursor = self.db.cursor()
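# poor man's upsert: UPDATE the row if it already exists, then INSERT OR IGNORE covers the first-seen case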
cursor.execute('''UPDATE data SET corpus=?, mtime=?, url=?, category=?, title=? WHERE id=?;''', (
corpus,
mtime,
url,
category,
title,
id
))
cursor.execute('''INSERT OR IGNORE INTO data (id, corpus, mtime, url, category, title) VALUES (?,?,?,?,?,?);''', (
id,
corpus,
mtime,
url,
category,
title
))
self.db.commit()
def is_uptodate(self, fname, mtime):
ret = {}
cursor = self.db.cursor()
cursor.execute('''SELECT mtime
FROM data
WHERE id = ? AND mtime = ?''',
(fname,mtime)
)
rows = cursor.fetchall()
if len(rows):
return True
return False
def search_by_query(self, query):
ret = {}
cursor = self.db.cursor()
cursor.execute('''SELECT
id, category, url, title, highlight(data, 0, '<strong>', '</strong>') corpus
FROM data
WHERE data MATCH ?
ORDER BY category, rank;''', (query,))
rows = cursor.fetchall()
for r in rows:
r = {
'id': r[0],
'category': r[1],
'url': r[2],
'title': r[3],
'txt': r[4],
}
category = r.get('category')
if category not in ret:
ret.update({category: {}})
maybe_fpath = os.path.join(
shared.config.get('dirs', 'content'),
category,
"%s.*" % r.get('id')
)
#fpath = glob.glob(maybe_fpath).pop()
ret.get(category).update({
r.get('id'): {
#'fpath': fpath,
'url': r.get('url'),
'title': r.get('title'),
'txt': r.get('txt')
}
})
return ret
def cli(self, query):
results = self.search_by_query(query)
for c, items in sorted(results.items()):
print("%s:" % c)
for fname, data in sorted(items.items()):
print(" %s" % data.get('fpath'))
print(" %s" % data.get('url'))
print("")
def html(self, query):
tmplvars = {
'results': self.search_by_query(query),
'term': query
}
return shared.j2.get_template(self.tmplfile).render(tmplvars)
class WebmentionQueue(object):
def __init__(self):
self.db = sqlite3.connect(
"%s" % shared.config.get('var', 'webmentiondb')
)
cursor = self.db.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS `archive` (
`id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
`received` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
`processed` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
`source` TEXT NOT NULL,
`target` TEXT NOT NULL
);''');
cursor.execute('''CREATE TABLE IF NOT EXISTS `queue` (
`id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
`timestamp` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
`source` TEXT NOT NULL,
`target` TEXT NOT NULL
);''');
self.db.commit()
def __exit__(self):
self.finish()
def finish(self):
self.db.close()
def queue(self, source, target):
cursor = self.db.cursor()
cursor.execute(
'''INSERT INTO queue (source,target) VALUES (?,?);''', (
source,
target
)
)
self.db.commit()
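A minimal usage sketch for the two helpers above, assuming `config.ini` has `searchdb` and `webmentiondb` entries in its `var` section pointing at writable SQLite files; the entry data and URLs are made up:

```python
import db

# index one entry and query it back from the command line
search = db.SearchDB()
search.append(
    id='example-entry',
    corpus='plain text body of the entry',
    mtime=1509094173,
    url='https://example.net/example-entry/',
    category='article',
    title='An example entry'
)
if not search.is_uptodate('example-entry', 1509094173):
    print('entry changed since it was indexed')
search.cli('example')
search.finish()

# queue an incoming webmention; parsing happens later, at site generation time
wm = db.WebmentionQueue()
wm.queue('https://example.com/a-reply/', 'https://example.net/example-entry/')
wm.finish()
```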

2726
nasg.py Executable file → Normal file

File diff suppressed because it is too large.

requirements.txt

@@ -1,30 +1,8 @@
-aiofiles==0.3.1
-appdirs==1.4.3
arrow==0.10.0
-breadability==0.1.20
-chardet==3.0.3
-decorator==4.0.11
-docopt==0.6.2
-httptools==0.0.9
Jinja2==2.9.6
langdetect==1.0.7
-lxml==3.7.3
-MarkupSafe==1.0
-packaging==16.8
-pyparsing==2.2.0
-python-dateutil==2.6.0
-python-frontmatter==0.4.2
-python-magic==0.4.13
-PyYAML==3.12
-requests==2.14.2
requests==2.12.4
requests-oauthlib==0.8.0
-sanic==0.5.4
sanic==0.6.0
-similar-text==0.2.0
-six==1.10.0
-ujson==1.35
unicode-slugify==0.1.3
-Unidecode==0.4.20
-uvloop==0.8.0
-validators==0.11.3
Wand==0.4.4
-websockets==3.3
-Whoosh==2.7.4

86
router.py Normal file

@@ -0,0 +1,86 @@
#!/usr/bin/env python3
#import asyncio
#import uvloop
from sanic import Sanic
import sanic.response
import logging
import db
import shared
import validators
import urllib.parse
if __name__ == '__main__':
logging_format = "[%(asctime)s] %(process)d-%(levelname)s "
logging_format += "%(module)s::%(funcName)s():l%(lineno)d: "
logging_format += "%(message)s"
logging.basicConfig(
format=logging_format,
level=logging.DEBUG
)
log = logging.getLogger()
# log_config=None prevents creation of access_log and error_log files
# since I'm running this from systemctl it already goes into syslog
app = Sanic('router', log_config=None)
# this is ok to be read-only
sdb = db.SearchDB()
@app.route("/oauth1", methods=["GET"])
async def oauth1(request):
token = request.args.get('oauth_token')
verifier = request.args.get('oauth_verifier')
tokendb = shared.TokenDB()
tokendb.update_token(
token,
verifier=verifier
)
return sanic.response.text("OK",status=200)
@app.route("/search", methods=["GET"])
async def search(request):
query = request.args.get('s')
r = sdb.html(query)
response = sanic.response.html(r, status=200)
return response
@app.route("/micropub", methods=["POST","GET"])
async def micropub(request):
return sanic.response.text("Not Implemented", status=501)
@app.route("/webmention", methods=["POST"])
async def webmention(request):
source = request.form.get('source')
target = request.form.get('target')
# validate urls
if not validators.url(source):
return sanic.response.text('Invalide source url', status=400)
if not validators.url(target):
return sanic.response.text('Invalide target url', status=400)
# check if our site is actually the target for the webmention
_target = urllib.parse.urlparse(target)
if _target.hostname not in shared.config.get('site', 'domains'):
return sanic.response.text('target domain is not me', status=400)
# ignore selfpings
_source = urllib.parse.urlparse(source)
if _source.hostname in shared.config.get('site', 'domains'):
return sanic.response.text('selfpings are not allowed', status=400)
# it is unfortunate that I need to init this every time, but
# otherwise it'll become read-only for reasons I'm yet to grasp
# the actual parsing will be done at site generation time
wdb = db.WebmentionQueue()
wdb.queue(source,target)
response = sanic.response.text("Accepted", status=202)
return response
app.run(host="127.0.0.1",port=8008, log_config=None)
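For reference, a client-side sketch of hitting the webmention endpoint above, assuming the router is running on 127.0.0.1:8008 as configured in the last line; the URLs are placeholders, and the target has to be one of the configured site domains:

```python
import requests

# the endpoint reads form-encoded 'source' and 'target' fields
r = requests.post(
    'http://127.0.0.1:8008/webmention',
    data={
        'source': 'https://example.com/a-reply/',
        'target': 'https://example.net/some-entry/',
    }
)

# 202 Accepted when the mention is queued, 400 with a short reason otherwise
print(r.status_code, r.text)
```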

416
shared.py

@@ -5,131 +5,10 @@ import glob
import logging
import subprocess
import json
-import requests
import sqlite3
-from urllib.parse import urlparse, urlunparse
-from whoosh import fields
-from whoosh import analysis
from slugify import slugify
import jinja2
LLEVEL = {
'critical': 50,
'error': 40,
'warning': 30,
'info': 20,
'debug': 10
}
def __expandconfig(config):
""" add the dirs to the config automatically """
basepath = os.path.expanduser(config.get('common','base'))
config.set('common', 'basedir', basepath)
for section in ['source', 'target']:
for option in config.options(section):
opt = config.get(section, option)
config.set(section, "%sdir" % option, os.path.join(basepath,opt))
config.set('target', 'filesdir', os.path.join(
config.get('target', 'builddir'),
config.get('source', 'files'),
))
config.set('target', 'commentsdir', os.path.join(
config.get('target', 'builddir'),
config.get('site', 'commentspath'),
))
return config
def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
""" Used to create short, lowercase slug for a number (an epoch) passed """
num = int(num)
return ((num == 0) and numerals[0]) or (
baseN(
num // b,
b,
numerals
).lstrip(numerals[0]) + numerals[num % b]
)
def slugfname(url):
return "%s" % slugify(
re.sub(r"^https?://(?:www)?", "", url),
only_ascii=True,
lower=True
)[:200]
ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
STRFISO = '%Y-%m-%dT%H:%M:%S%z'
URLREGEX = re.compile(
r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
)
EXIFREXEG = re.compile(
r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
)
MDIMGREGEX = re.compile(
r'(!\[(.*)\]\((?:\/(?:files|cache)'
r'(?:\/[0-9]{4}\/[0-9]{2})?\/(.*\.(?:jpe?g|png|gif)))'
r'(?:\s+[\'\"]?(.*?)[\'\"]?)?\)(?:\{(.*?)\})?)'
, re.IGNORECASE)
schema = fields.Schema(
url=fields.ID(
stored=True,
unique=True
),
category=fields.TEXT(
stored=True,
),
date=fields.DATETIME(
stored=True,
sortable=True
),
title=fields.TEXT(
stored=True,
analyzer=analysis.FancyAnalyzer()
),
weight=fields.NUMERIC(
sortable=True
),
img=fields.TEXT(
stored=True
),
content=fields.TEXT(
stored=True,
analyzer=analysis.FancyAnalyzer()
),
fuzzy=fields.NGRAMWORDS(
tokenizer=analysis.NgramTokenizer(4)
),
mtime=fields.NUMERIC(
stored=True
)
#slug=fields.NGRAMWORDS(
#tokenizer=analysis.NgramTokenizer(4)
#),
#reactions=fields.NGRAMWORDS(
#tokenizer=analysis.NgramTokenizer(4)
#),
#tags=fields.TEXT(
#stored=False,
#analyzer=analysis.KeywordAnalyzer(
#lowercase=True,
#commas=True
#),
#),
)
config = configparser.ConfigParser(
interpolation=configparser.ExtendedInterpolation(),
allow_no_value=True
)
config.read('config.ini')
config = __expandconfig(config)
class CMDLine(object):
    def __init__(self, executable):

@@ -138,7 +17,6 @@ class CMDLine(object):
            raise OSError('No %s found in PATH!' % executable)
        return

    @staticmethod
    def _which(name):
        for d in os.environ['PATH'].split(':'):

@@ -148,33 +26,6 @@
        return None
def __enter__(self):
self.process = subprocess.Popen(
[self.executable, "-stay_open", "True", "-@", "-"],
universal_newlines=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
return self
def __exit__(self, exc_type, exc_value, traceback):
self.process.stdin.write("-stay_open\nFalse\n")
self.process.stdin.flush()
def execute(self, *args):
args = args + ("-execute\n",)
self.process.stdin.write(str.join("\n", args))
self.process.stdin.flush()
output = ""
fd = self.process.stdout.fileno()
while not output.endswith(self.sentinel):
output += os.read(fd, 4096).decode('utf-8', errors='ignore')
return output[:-len(self.sentinel)]
class Pandoc(CMDLine):
    """ Pandoc command line call with piped in- and output """

@@ -254,23 +105,68 @@
        return stdout.decode('utf-8').strip()
-class HeadlessChromium(CMDLine):
-    def __init__(self, url):
-        super().__init__('chromium-browser')
-        self.url = url
class ExifTool(CMDLine):
    def __init__(self, fpath):
        self.fpath = fpath
        super().__init__('exiftool')

    @staticmethod
    def exifdate2iso(value):
        """ converts and EXIF date string to ISO 8601 format

        :param value: EXIF date (2016:05:01 00:08:24)
        :type arg1: str
        :return: ISO 8601 string with UTC timezone 2016-05-01T00:08:24+0000
        :rtype: str
        """
        if not isinstance(value, str):
            return value
        match = REGEX['exifdate'].match(value)
        if not match:
            return value
        return "%s-%s-%sT%s+0000" % (
            match.group('year'),
            match.group('month'),
            match.group('day'),
            match.group('time')
        )

-    def get(self):
    def read(self):
        cmd = (
            self.executable,
-            '--headless',
-            '--disable-gpu',
-            '--disable-preconnect',
-            '--dump-dom',
-            '--timeout 60',
-            '--save-page-as-mhtml',
-            "%s" % self.url
            '-sort',
            '-json',
            '-MIMEType',
            '-FileType',
            '-FileName',
            '-ModifyDate',
            '-CreateDate',
            '-DateTimeOriginal',
            '-ImageHeight',
            '-ImageWidth',
            '-Aperture',
            '-FOV',
            '-ISO',
            '-FocalLength',
            '-FNumber',
            '-FocalLengthIn35mmFormat',
            '-ExposureTime',
            '-Copyright',
            '-Artist',
            '-Model',
            '-GPSLongitude#',
            '-GPSLatitude#',
            '-LensID',
            '-LensSpec',
            '-Lens',
            '-ReleaseDate',
            '-Description',
            '-Headline',
            '-HierarchicalSubject',
            self.fpath
        )
-        logging.debug('getting URL %s with headless chrome', self.url)
        logging.debug('reading EXIF from %s', self.fpath)
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,

@@ -280,113 +176,111 @@
        stdout, stderr = p.communicate()
        if stderr:
-            logging.error(
-                "Error getting URL:\n\t%s\n\t%s",
-                cmd,
-                stderr
-            )
-        return stdout.decode('utf-8').strip()
            logging.error("Error reading EXIF:\n\t%s\n\t%s", cmd, stderr)

        exif = json.loads(stdout.decode('utf-8').strip()).pop()
        if 'ReleaseDate' in exif and 'ReleaseTime' in exif:
            exif['DateTimeRelease'] = "%s %s" % (exif.get('ReleaseDate'), exif.get('ReleaseTime')[:8])
            del(exif['ReleaseDate'])
            del(exif['ReleaseTime'])

        for k, v in exif.items():
            exif[k] = self.exifdate2iso(v)

        return exif

def __expandconfig():
    c = configparser.ConfigParser(
        interpolation=configparser.ExtendedInterpolation(),
        allow_no_value=True
    )
    c.read('config.ini')
    for s in c.sections():
        for o in c.options(s):
            curr = c.get(s, o)
            if 'photo' == s and 'regex' == o:
                REGEX.update({'photo': re.compile(curr)})
            c.set(s, o, os.path.expanduser(curr))

def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
    """ Used to create short, lowercase slug for a number (an epoch) passed """
    num = int(num)
    return ((num == 0) and numerals[0]) or (
        baseN(
            num // b,
            b,
            numerals
        ).lstrip(numerals[0]) + numerals[num % b]
    )

-class wget(CMDLine):
-    def __init__(self, url, dirname=None):
-        super().__init__('wget')
-        self.url = url
-        self.slug = dirname or slugfname(self.url)
-        self.saveto = os.path.join(
-            config.get('source', 'offlinecopiesdir'),
-            self.slug
-        )
-
-    def archive(self):
-        cmd = (
-            self.executable,
-            '-e',
-            'robots=off',
-            '--timeout=360',
-            '--no-clobber',
-            '--no-directories',
-            '--adjust-extension',
-            '--span-hosts',
-            '--wait=1',
-            '--random-wait',
-            '--convert-links',
-            #'--backup-converted',
-            '--page-requisites',
-            '--directory-prefix=%s' % self.saveto,
-            "%s" % self.url
-        )
-        logging.debug('getting URL %s with wget', self.url)
-        p = subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
-        stdout, stderr = p.communicate()
-        if stderr:
-            logging.error(
-                "Error getting URL:\n\t%s\n\t%s",
-                cmd,
-                stderr
-            )
-        return stdout.decode('utf-8').strip()
-
-def find_realurl(url):
-    headers = requests.utils.default_headers()
-    headers.update({
-        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
-    })
-    try:
-        r = requests.get(
-            url,
-            allow_redirects=True,
-            timeout=60,
-            headers=headers
-        )
-    except Exception as e:
-        logging.error('getting real url failed: %s', e)
-        return (None, 400)
-    finalurl = list(urlparse(r.url))
-    finalurl[4] = '&'.join(
-        [x for x in finalurl[4].split('&') if not x.startswith('utm_')])
-    finalurl = urlunparse(finalurl)
-    return (finalurl, r.status_code)
-
-def find_archiveorgurl(url):
-    url, status = find_realurl(url)
-    if status == requests.codes.ok:
-        return url
-    try:
-        a = requests.get(
-            "http://archive.org/wayback/available?url=%s" % url,
-        )
-    except Exception as e:
-        logging.error('Failed to fetch archive.org availability for %s' % url)
-        return None
-    if not a:
-        logging.error('empty archive.org availability for %s' % url)
-        return None
-    try:
-        a = json.loads(a.text)
-        aurl = a.get(
-            'archived_snapshots', {}
-        ).get(
-            'closest', {}
-        ).get(
-            'url', None
-        )
-        if aurl:
-            logging.debug("found %s in archive.org for %s", aurl, url)
-            return aurl
-    except Exception as e:
-        logging.error("archive.org parsing failed: %s", e)
-    return None

def slugfname(url):
    return "%s" % slugify(
        re.sub(r"^https?://(?:www)?", "", url),
        only_ascii=True,
        lower=True
    )[:200]

def __setup_sitevars():
    SiteVars = {}
    section = 'site'
    for o in config.options(section):
        SiteVars.update({o: config.get(section, o)})

    # add site author
    section = 'author'
    SiteVars.update({section: {}})
    for o in config.options(section):
        SiteVars[section].update({o: config.get(section, o)})

    # add extra sections to author
    for sub in config.get('author', 'appendwith').split():
        SiteVars[section].update({sub: {}})
        for o in config.options(sub):
            SiteVars[section][sub].update({o: config.get(sub, o)})

    # push the whole thing into cache
    return SiteVars

ARROWFORMAT = {
    'iso': 'YYYY-MM-DDTHH:mm:ssZ',
    'display': 'YYYY-MM-DD HH:mm'
}

LLEVEL = {
    'critical': 50,
    'error': 40,
    'warning': 30,
    'info': 20,
    'debug': 10
}

REGEX = {
    'exifdate': re.compile(
        r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
        r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
    ),
    'cleanurl': re.compile(r"^https?://(?:www)?"),
    'urls': re.compile(
        r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
        r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
    ),
    'mdimg': re.compile(
        r'(?P<shortcode>\!\[(?P<alt>[^\]]+)\]\((?P<fname>[^\s]+)'
        r'(?:\s[\'\"](?P<title>[^\"\']+)[\'\"])?\)(?:\{(?P<css>[^\}]+)\})?)',
        re.IGNORECASE
    )
}

config = __expandconfig()

j2 = jinja2.Environment(
    loader=jinja2.FileSystemLoader(
        searchpath=config.get('dirs', 'tmpl')
    ),
    lstrip_blocks=True
)

site = __setup_sitevars()
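A short usage sketch of the rewritten EXIF reader above; the path is just an example:

```python
import shared

# returns a flat dict of the requested EXIF fields; date-like values are
# normalised to ISO 8601 by exifdate2iso()
exif = shared.ExifTool('content/photo/example.jpg').read()
print(exif.get('Model'), exif.get('DateTimeOriginal'))
```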