2.0-alpha1: tags dropped, favs dropped, bookmarks dropped, reposts dropped, better async rendering; TODO comments, websub pings, webmentions

This commit is contained in:
Peter Molnar 2017-10-27 10:29:33 +01:00
parent 112448cf92
commit 4a699ef9f5
7 changed files with 1495 additions and 2234 deletions

README.md (168 lines changed)

@ -1,8 +1,166 @@
# NASG: Not Another Static Generator...
# NASG (Not Another Static Generator)
So I ended up writing my own static generator, and this is (most of) the code for it.
This is a tiny static site generator, written in Python, to scratch my own itches.
It is most probably not suitable for anyone else.
Don't expect anything fancy and please be aware that my Python Fu has much to learn.
## Why not [insert static generator here]?
I've written about the general ideas and approach in my
[Going Static](https://petermolnar.net/going-static/) entry.
- DRY (Don't Repeat Yourself) is good, so instead of sidecar files for images I'm using embedded XMP metadata, which most of the available generators don't handle well;
- writing a proper plugin for an existing generator (Pelican, Nikola, etc.) might have taken longer, and I wanted to extend my Python knowledge;
- I wanted to use the best available utilities for some tasks, such as `Pandoc` and `exiftool`, instead of Python libraries trying to achieve the same (see the sketch after this list);
- I needed to handle webmentions and comments.
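A rough sketch of the `exiftool` approach mentioned above: reading embedded metadata boils down to shelling out to `exiftool` and parsing its JSON output. The real implementation is the `ExifTool` class in `shared.py`; the tag list below is only illustrative.

```python
# minimal sketch: read embedded XMP/EXIF metadata via exiftool's JSON output
# (the full tag list and error handling live in shared.py's ExifTool class)
import json
import subprocess

def read_meta(fpath):
    out = subprocess.check_output(
        ['exiftool', '-json', '-Artist', '-Copyright', '-Description', fpath]
    )
    # exiftool returns a JSON array with one object per file
    return json.loads(out.decode('utf-8')).pop()
```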
Don't expect anything fancy: my Python Fu has much to learn.
## How content is organized
The directory structure of the "source" is something like this:
```
├── content
│   ├── category1 (containing YAML + MD files)
│   ├── category2 (containing YAML + MD files)
│   ├── photo (containing jpg files)
│   ├── _category_excluded_from_listing_1 (containing YAML + MD files)
├── files
│   ├── image (my own pictures)
│   ├── photo -> ../content/photo
│   └── pic (random images)
├── nasg
│   ├── archive.py
│   ├── config.ini
│   ├── db.py
│   ├── LICENSE
│   ├── nasg.py
│   ├── README.md
│   ├── requirements.txt
│   ├── router.py
│   ├── shared.py
│   └── templates
├── static
│   ├── favicon.ico
│   ├── favicon.png
│   └── pgp.asc
└── var
    ├── gone.tsv
    ├── redirects.tsv
    ├── s.sqlite
    ├── tokens.json
    └── webmention.sqlite
```
Content files are either YAML front matter + Markdown, with an `.md` extension, or JPG images with embedded metadata, with a `.jpg` extension.
Inline images in the content are looked up in all subdirectories of `files`; if their Artist and/or Copyright EXIF fields match the regex set in the configuration, their EXIF data is read and displayed as well.
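An inline image reference is regular Markdown, optionally with a title and a CSS class appended; the filename and class below are illustrative:

```
![Sunset over the lake](/files/photo/sunset.jpg "Sunset, somewhere"){photo}
```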
`gone.tsv` is a simple list of URIs that should return a `410 Gone` response, while `redirects.tsv` is a tab-separated file of `from to` pairs that should be `301` redirected. Both are compiled into a `magic.php` file, so if the host supports executing PHP, it will take care of these.
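For illustration, entries look like this (the paths are made up):

```
gone.tsv (one URI per line):
/some-deleted-entry/

redirects.tsv (tab-separated from/to pairs):
/old-slug	/category1/new-slug/
```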
## Output
`nasg.py` generates a `build` directory with a directory per entry, each containing an `index.html`, so URLs can look like `https://domain.com/filename/`.
Categories are rendered into `category/category_name`. Pagination is under `category/category_name/page/X`. Each category also gets a feed at `category/category_name/feed`, in the form of an `index.atom` Atom feed.
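Roughly, the generated tree ends up looking like this (entry and category names are illustrative):

```
build
├── example-entry
│   └── index.html
└── category
    └── category1
        ├── index.html
        ├── feed
        │   └── index.atom
        └── page
            └── 2
                └── index.html
```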
## Webserver configuration
A minimal nginx configuration for the virtualhost:
```
# --- Virtual Host ---
upstream {{ domain }} {
server unix:/var/run/php/{{ domain }}.sock;
}
server {
listen 80;
server_name .{{ domain }};
rewrite ^ https://$server_name$request_uri redirect;
access_log /dev/null;
error_log /dev/null;
}
server {
listen 443 ssl http2;
server_name .{{ domain }};
ssl_certificate /etc/letsencrypt/live/{{ domain }}/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/{{ domain }}/privkey.pem;
ssl_dhparam dh.pem;
add_header X-Frame-Options "SAMEORIGIN";
add_header X-Content-Type-Options "nosniff";
add_header X-XSS-Protection "1; mode=block";
add_header Strict-Transport-Security "max-age=31536000; includeSubdomains;";
root /[path to root]/{{ domain }};
location = /favicon.ico {
log_not_found off;
access_log off;
}
location = /robots.txt {
log_not_found off;
access_log off;
}
location ~ ^(?<script_name>.+?\.php)(?<path_info>.*)$ {
try_files $uri $script_name =404;
fastcgi_param SCRIPT_FILENAME $document_root$script_name;
fastcgi_param SCRIPT_NAME $script_name;
fastcgi_param PATH_INFO $path_info;
fastcgi_param PATH_TRANSLATED $document_root$path_info;
fastcgi_param QUERY_STRING $query_string;
fastcgi_param REQUEST_METHOD $request_method;
fastcgi_param CONTENT_TYPE $content_type;
fastcgi_param CONTENT_LENGTH $content_length;
fastcgi_param SCRIPT_NAME $script_name;
fastcgi_param REQUEST_URI $request_uri;
fastcgi_param DOCUMENT_URI $document_uri;
fastcgi_param DOCUMENT_ROOT $document_root;
fastcgi_param SERVER_PROTOCOL $server_protocol;
fastcgi_param GATEWAY_INTERFACE CGI/1.1;
fastcgi_param SERVER_SOFTWARE nginx;
fastcgi_param REMOTE_ADDR $remote_addr;
fastcgi_param REMOTE_PORT $remote_port;
fastcgi_param SERVER_ADDR $server_addr;
fastcgi_param SERVER_PORT $server_port;
fastcgi_param SERVER_NAME $server_name;
fastcgi_param HTTP_PROXY "";
fastcgi_param HTTPS $https if_not_empty;
fastcgi_param SSL_PROTOCOL $ssl_protocol if_not_empty;
fastcgi_param SSL_CIPHER $ssl_cipher if_not_empty;
fastcgi_param SSL_SESSION_ID $ssl_session_id if_not_empty;
fastcgi_param SSL_CLIENT_VERIFY $ssl_client_verify if_not_empty;
fastcgi_param REDIRECT_STATUS 200;
fastcgi_index index.php;
fastcgi_connect_timeout 10;
fastcgi_send_timeout 360;
fastcgi_read_timeout 3600;
fastcgi_buffer_size 512k;
fastcgi_buffers 512 512k;
fastcgi_keep_conn on;
fastcgi_intercept_errors on;
fastcgi_split_path_info ^(?<script_name>.+?\.php)(?<path_info>.*)$;
fastcgi_pass {{ domain }};
}
location / {
try_files $uri $uri/ $uri.html $uri/index.html $uri/index.xml $uri/index.atom index.php @rewrites;
}
location @rewrites {
rewrite ^ /magic.php?$args last;
}
location ~* \.(css|js|eot|woff|ttf|woff2)$ {
expires 1d;
add_header Cache-Control "public, must-revalidate, proxy-revalidate";
add_header "Vary" "Accept-Encoding";
}
location ~* \.(png|ico|gif|svg|jpg|jpeg|webp|avi|mpg|mpeg|mp4|mp3)$ {
expires 7d;
add_header Cache-Control "public, must-revalidate, proxy-revalidate";
add_header "Vary" "Accept-Encoding";
}
}
```

archive.py

@ -5,14 +5,16 @@ import glob
import logging
import shutil
import subprocess
import imghdr
import arrow
from pprint import pprint
from requests_oauthlib import OAuth1Session, oauth1_session, OAuth2Session, oauth2_session
from oauthlib.oauth2 import BackendApplicationClient
import db
import shared
class Favs(object):
def __init__(self, confgroup):
self.confgroup = confgroup
@ -101,6 +103,7 @@ class FlickrFavs(Favs):
fav = FlickrFav(photo)
if not fav.exists:
fav.run()
#fav.fix_extension()
class FivehpxFavs(Favs):
def __init__(self):
@ -179,6 +182,7 @@ class FivehpxFavs(Favs):
fav = FivehpxFav(photo)
if not fav.exists:
fav.run()
#fav.fix_extension()
class TumblrFavs(Favs):
@ -242,7 +246,7 @@ class DAFavs(Favs):
'https://www.deviantart.com/api/v1/oauth2/collections/folders',
params={
'username': self.username,
'calculate_size': 'false',
'calculate_size': 'true',
'ext_preload': 'false',
'mature_content': 'true'
}
@ -304,29 +308,29 @@ class DAFavs(Favs):
has_more = self.has_more(js.get('has_more'))
offset = js.get('next_offset')
while True == has_more:
logging.info('iterating over DA results with offset %d', offset)
#logging.info('iterating over DA results with offset %d', offset)
paged = self.getpaged(offset)
new = paged.get('results', [])
if not len(new):
#logging.error('empty results from deviantART, breaking loop')
break
favs = favs + new
favs = [*favs, *new]
has_more = self.has_more(paged.get('has_more'))
if not has_more:
break
n = int(paged.get('next_offset'))
if not n:
break
offset = offset + n
offset = n
self.favs = favs
for fav in self.favs:
f = DAFav(fav)
if f.exists:
continue
if not f.exists:
f.fav.update({'meta': self.getsinglemeta(fav.get('deviationid'))})
f.run()
#f.fix_extension()
f.fav.update({'meta': self.getsinglemeta(fav.get('deviationid'))})
f.run()
class ImgFav(object):
def __init__(self):
@ -349,7 +353,19 @@ class ImgFav(object):
@property
def exists(self):
return os.path.exists(self.target)
maybe = glob.glob(self.target.replace('.jpg', '.*'))
if len(maybe):
return True
return False
def fix_extension(self):
# identify file format
what = imghdr.what(self.target)
# rename file
new = self.target.replace('.jpg', '.%s' % what)
if new != self.target:
shutil.move(self.target, new)
self.target = new
def pull_image(self):
logging.info("pulling image %s to %s", self.imgurl, self.target)
@ -359,8 +375,11 @@ class ImgFav(object):
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)
def write_exif(self):
what = imghdr.what(self.target)
# imghdr reports 'jpeg'/'png'; only write EXIF for formats exiftool should touch
if what not in ('jpeg', 'png'):
return
logging.info('populating EXIF data of %s' % self.target)
tags = list(set(self.meta.get('tags',[])))
dt = self.meta.get('dt').to('utc')
@ -387,7 +406,7 @@ class ImgFav(object):
params = [
'exiftool',
'-overwrite_original',
'-EXIF:Artist=%s' % author_name[:64],
#'-EXIF:Artist=%s' % author_name[:64],
'-XMP:Copyright=Copyright %s %s (%s)' % (
dt.format('YYYY'),
author_name,
@ -501,6 +520,7 @@ class FlickrFav(ImgFav):
self.photo.get('description', {}).get('_content', '')
)
self.fix_extension()
self.write_exif()
class FivehpxFav(ImgFav):
@ -546,12 +566,14 @@ class FivehpxFav(ImgFav):
}
c = "%s" % self.photo.get('description', '')
self.content = shared.Pandoc('plain').convert(c)
self.fix_extension()
self.write_exif()
class DAFav(ImgFav):
def __init__(self, fav):
self.fav = fav
self.deviationid = fav.get('deviationid')
#logging.info('working on %s', self.deviationid)
self.url = fav.get('url')
self.title = fav.get('title', False) or self.deviationid
self.author = self.fav.get('author').get('username')
@ -562,9 +584,21 @@ class DAFav(ImgFav):
shared.slugfname(self.author)
)
)
self.imgurl = None
if 'content' in fav:
if 'src' in fav['content']:
self.imgurl = fav.get('content').get('src')
elif 'preview' in fav:
if 'src' in fav['preview']:
self.imgurl = fav.get('preview').get('src')
self.imgurl = fav.get('content', {}).get('src')
def run(self):
if not self.imgurl:
logging.error('imgurl is empty for deviantart %s', self.deviationid)
return
self.pull_image()
self.meta = {
@ -583,6 +617,7 @@ class DAFav(ImgFav):
}
c = "%s" % self.fav.get('meta', {}).get('description', '')
self.content = shared.Pandoc('plain').convert(c)
self.fix_extension()
self.write_exif()
@ -600,7 +635,10 @@ class TumblrFav(object):
@property
def exists(self):
return os.path.exists(self.target.replace('.jpg', '_0.jpg'))
maybe = glob.glob(self.target.replace('.jpg', '_0.*'))
if len(maybe):
return True
return False
def run(self):
content = "%s" % self.like.get('caption', '')
@ -635,6 +673,7 @@ class TumblrFav(object):
img.content = content
img.meta = meta
img.pull_image()
img.fix_extension()
img.write_exif()
icntr = icntr + 1
@ -681,7 +720,7 @@ class Oauth1Flow(object):
self.service = service
self.key = shared.config.get("api_%s" % service, 'api_key')
self.secret = shared.config.get("api_%s" % service, 'api_secret')
self.tokendb = shared.TokenDB()
self.tokendb = db.TokenDB()
self.t = self.tokendb.get_service(self.service)
self.oauth_init()
@ -796,7 +835,7 @@ class TumblrOauth(Oauth1Flow):
if __name__ == '__main__':
logging.basicConfig(level=10)
logging.basicConfig(level=20)
flickr = FlickrFavs()
flickr.run()

db.py (new file, 234 lines)

@ -0,0 +1,234 @@
import os
import json
import sqlite3
import glob
import shared
# TODO sqlite3 cache instead of filesystem ?
class TokenDB(object):
def __init__(self, uuid='tokens'):
self.db = shared.config.get('var', 'tokendb')
self.tokens = {}
self.refresh()
def refresh(self):
self.tokens = {}
if os.path.isfile(self.db):
with open(self.db, 'rt') as f:
self.tokens = json.loads(f.read())
def save(self):
with open(self.db, 'wt') as f:
f.write(json.dumps(
self.tokens, indent=4, sort_keys=True
))
def get_token(self, token):
return self.tokens.get(token, None)
def get_service(self, service):
token = self.tokens.get(service, None)
return token
def set_service(self, service, tokenid):
self.tokens.update({
service: tokenid
})
self.save()
def update_token(self,
token,
oauth_token_secret=None,
access_token=None,
access_token_secret=None,
verifier=None):
t = self.tokens.get(token, {})
if oauth_token_secret:
t.update({
'oauth_token_secret': oauth_token_secret
})
if access_token:
t.update({
'access_token': access_token
})
if access_token_secret:
t.update({
'access_token_secret': access_token_secret
})
if verifier:
t.update({
'verifier': verifier
})
self.tokens.update({
token: t
})
self.save()
def clear(self):
self.tokens = {}
self.save()
def clear_service(self, service):
t = self.tokens.get(service)
if t:
del(self.tokens[t])
del(self.tokens[service])
self.save()
class SearchDB(object):
tmplfile = 'Search.html'
def __init__(self):
self.db = sqlite3.connect(
"%s" % shared.config.get('var', 'searchdb')
)
cursor = self.db.cursor()
cursor.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS data USING FTS5(
id,
corpus,
mtime,
url,
category,
title
)''')
self.db.commit()
def __exit__(self):
self.finish()
def finish(self):
self.db.close()
def append(self, id, corpus, mtime, url, category, title):
mtime = int(mtime)
cursor = self.db.cursor()
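# poor man's upsert: UPDATE the row if it already exists, then INSERT OR IGNORE covers the new-row case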
cursor.execute('''UPDATE data SET corpus=?, mtime=?, url=?, category=?, title=? WHERE id=?;''', (
corpus,
mtime,
url,
category,
title,
id
))
cursor.execute('''INSERT OR IGNORE INTO data (id, corpus, mtime, url, category, title) VALUES (?,?,?,?,?,?);''', (
id,
corpus,
mtime,
url,
category,
title
))
self.db.commit()
def is_uptodate(self, fname, mtime):
ret = {}
cursor = self.db.cursor()
cursor.execute('''SELECT mtime
FROM data
WHERE id = ? AND mtime = ?''',
(fname,mtime)
)
rows = cursor.fetchall()
if len(rows):
return True
return False
def search_by_query(self, query):
ret = {}
cursor = self.db.cursor()
cursor.execute('''SELECT
id, category, url, title, highlight(data, 1, '<strong>', '</strong>') corpus
FROM data
WHERE data MATCH ?
ORDER BY category, rank;''', (query,))
rows = cursor.fetchall()
for r in rows:
r = {
'id': r[0],
'category': r[1],
'url': r[2],
'title': r[3],
'txt': r[4],
}
category = r.get('category')
if category not in ret:
ret.update({category: {}})
maybe_fpath = os.path.join(
shared.config.get('dirs', 'content'),
category,
"%s.*" % r.get('id')
)
#fpath = glob.glob(maybe_fpath).pop()
ret.get(category).update({
r.get('id'): {
#'fpath': fpath,
'url': r.get('url'),
'title': r.get('title'),
'txt': r.get('txt')
}
})
return ret
def cli(self, query):
results = self.search_by_query(query)
for c, items in sorted(results.items()):
print("%s:" % c)
for fname, data in sorted(items.items()):
print(" %s" % data.get('fpath'))
print(" %s" % data.get('url'))
print("")
def html(self, query):
tmplvars = {
'results': self.search_by_query(query),
'term': query
}
return shared.j2.get_template(self.tmplfile).render(tmplvars)
class WebmentionQueue(object):
def __init__(self):
self.db = sqlite3.connect(
"%s" % shared.config.get('var', 'webmentiondb')
)
cursor = self.db.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS `archive` (
`id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
`received` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
`processed` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
`source` TEXT NOT NULL,
`target` TEXT NOT NULL
);''');
cursor.execute('''CREATE TABLE IF NOT EXISTS `queue` (
`id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
`timestamp` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
`source` TEXT NOT NULL,
`target` TEXT NOT NULL
);''');
self.db.commit()
def __exit__(self):
self.finish()
def finish(self):
self.db.close()
def queue(self, source, target):
cursor = self.db.cursor()
cursor.execute(
'''INSERT INTO queue (source,target) VALUES (?,?);''', (
source,
target
)
)
self.db.commit()
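The search side of `db.py` can be exercised from a Python shell, roughly like this (the query string is just an example):

```python
# assumes config.ini's [var] searchdb points at an existing s.sqlite
import db

sdb = db.SearchDB()
sdb.cli('pandoc')           # plain-text results, grouped by category
html = sdb.html('pandoc')   # same results rendered through Search.html
sdb.finish()
```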

nasg.py (2726 lines changed; executable file → normal file)

File diff suppressed because it is too large.

requirements.txt

@ -1,30 +1,8 @@
aiofiles==0.3.1
appdirs==1.4.3
arrow==0.10.0
breadability==0.1.20
chardet==3.0.3
decorator==4.0.11
docopt==0.6.2
httptools==0.0.9
Jinja2==2.9.6
langdetect==1.0.7
lxml==3.7.3
MarkupSafe==1.0
packaging==16.8
pyparsing==2.2.0
python-dateutil==2.6.0
python-frontmatter==0.4.2
python-magic==0.4.13
PyYAML==3.12
requests==2.14.2
sanic==0.5.4
similar-text==0.2.0
six==1.10.0
ujson==1.35
requests==2.12.4
requests-oauthlib==0.8.0
sanic==0.6.0
unicode-slugify==0.1.3
Unidecode==0.4.20
uvloop==0.8.0
validators==0.11.3
Wand==0.4.4
websockets==3.3
Whoosh==2.7.4

router.py (new file, 86 lines)

@ -0,0 +1,86 @@
#!/usr/bin/env python3
#import asyncio
#import uvloop
from sanic import Sanic
import sanic.response
import logging
import db
import shared
import validators
import urllib.parse
if __name__ == '__main__':
logging_format = "[%(asctime)s] %(process)d-%(levelname)s "
logging_format += "%(module)s::%(funcName)s():l%(lineno)d: "
logging_format += "%(message)s"
logging.basicConfig(
format=logging_format,
level=logging.DEBUG
)
log = logging.getLogger()
# log_config=None prevents creation of access_log and error_log files
# since I'm running this from systemctl it already goes into syslog
app = Sanic('router', log_config=None)
# this is ok to be read-only
sdb = db.SearchDB()
@app.route("/oauth1", methods=["GET"])
async def oauth1(request):
token = request.args.get('oauth_token')
verifier = request.args.get('oauth_verifier')
tokendb = db.TokenDB()
tokendb.update_token(
token,
verifier=verifier
)
return sanic.response.text("OK",status=200)
@app.route("/search", methods=["GET"])
async def search(request):
query = request.args.get('s')
r = sdb.html(query)
response = sanic.response.html(r, status=200)
return response
@app.route("/micropub", methods=["POST","GET"])
async def micropub(request):
return sanic.response.text("Not Implemented", status=501)
@app.route("/webmention", methods=["POST"])
async def webmention(request):
source = request.form.get('source')
target = request.form.get('target')
# validate urls
if not validators.url(source):
return sanic.response.text('Invalid source URL', status=400)
if not validators.url(target):
return sanic.response.text('Invalid target URL', status=400)
# check if our site is actually the target for the webmention
_target = urllib.parse.urlparse(target)
if _target.hostname not in shared.config.get('site', 'domains'):
return sanic.response.text('target domain is not me', status=400)
# ignore selfpings
_source = urllib.parse.urlparse(source)
if _source.hostname in shared.config.get('site', 'domains'):
return sanic.response.text('selfpings are not allowed', status=400)
# it is unfortunate that I need to init this every time, but
# otherwise it'll become read-only for reasons I'm yet to grasp
# the actual parsing will be done at site generation time
wdb = db.WebmentionQueue()
wdb.queue(source,target)
response = sanic.response.text("Accepted", status=202)
return response
app.run(host="127.0.0.1",port=8008, log_config=None)
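A quick manual test of the webmention endpoint could look like this (both URLs are placeholders):

```python
# smoke test for the /webmention route above; adjust the URLs to real pages
import requests

r = requests.post(
    'http://127.0.0.1:8008/webmention',
    data={
        'source': 'https://example.com/their-reply/',
        'target': 'https://mydomain.tld/my-entry/',
    }
)
print(r.status_code, r.text)  # expect 202 Accepted when the target domain matches
```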

shared.py (416 lines changed)

@ -5,131 +5,10 @@ import glob
import logging
import subprocess
import json
import requests
from urllib.parse import urlparse, urlunparse
import sqlite3
from whoosh import fields
from whoosh import analysis
from slugify import slugify
LLEVEL = {
'critical': 50,
'error': 40,
'warning': 30,
'info': 20,
'debug': 10
}
def __expandconfig(config):
""" add the dirs to the config automatically """
basepath = os.path.expanduser(config.get('common','base'))
config.set('common', 'basedir', basepath)
for section in ['source', 'target']:
for option in config.options(section):
opt = config.get(section, option)
config.set(section, "%sdir" % option, os.path.join(basepath,opt))
config.set('target', 'filesdir', os.path.join(
config.get('target', 'builddir'),
config.get('source', 'files'),
))
config.set('target', 'commentsdir', os.path.join(
config.get('target', 'builddir'),
config.get('site', 'commentspath'),
))
return config
def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
""" Used to create short, lowercase slug for a number (an epoch) passed """
num = int(num)
return ((num == 0) and numerals[0]) or (
baseN(
num // b,
b,
numerals
).lstrip(numerals[0]) + numerals[num % b]
)
def slugfname(url):
return "%s" % slugify(
re.sub(r"^https?://(?:www)?", "", url),
only_ascii=True,
lower=True
)[:200]
ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
STRFISO = '%Y-%m-%dT%H:%M:%S%z'
URLREGEX = re.compile(
r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
)
EXIFREXEG = re.compile(
r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
)
MDIMGREGEX = re.compile(
r'(!\[(.*)\]\((?:\/(?:files|cache)'
r'(?:\/[0-9]{4}\/[0-9]{2})?\/(.*\.(?:jpe?g|png|gif)))'
r'(?:\s+[\'\"]?(.*?)[\'\"]?)?\)(?:\{(.*?)\})?)'
, re.IGNORECASE)
schema = fields.Schema(
url=fields.ID(
stored=True,
unique=True
),
category=fields.TEXT(
stored=True,
),
date=fields.DATETIME(
stored=True,
sortable=True
),
title=fields.TEXT(
stored=True,
analyzer=analysis.FancyAnalyzer()
),
weight=fields.NUMERIC(
sortable=True
),
img=fields.TEXT(
stored=True
),
content=fields.TEXT(
stored=True,
analyzer=analysis.FancyAnalyzer()
),
fuzzy=fields.NGRAMWORDS(
tokenizer=analysis.NgramTokenizer(4)
),
mtime=fields.NUMERIC(
stored=True
)
#slug=fields.NGRAMWORDS(
#tokenizer=analysis.NgramTokenizer(4)
#),
#reactions=fields.NGRAMWORDS(
#tokenizer=analysis.NgramTokenizer(4)
#),
#tags=fields.TEXT(
#stored=False,
#analyzer=analysis.KeywordAnalyzer(
#lowercase=True,
#commas=True
#),
#),
)
config = configparser.ConfigParser(
interpolation=configparser.ExtendedInterpolation(),
allow_no_value=True
)
config.read('config.ini')
config = __expandconfig(config)
import jinja2
class CMDLine(object):
def __init__(self, executable):
@ -138,7 +17,6 @@ class CMDLine(object):
raise OSError('No %s found in PATH!' % executable)
return
@staticmethod
def _which(name):
for d in os.environ['PATH'].split(':'):
@ -148,33 +26,6 @@ class CMDLine(object):
return None
def __enter__(self):
self.process = subprocess.Popen(
[self.executable, "-stay_open", "True", "-@", "-"],
universal_newlines=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
return self
def __exit__(self, exc_type, exc_value, traceback):
self.process.stdin.write("-stay_open\nFalse\n")
self.process.stdin.flush()
def execute(self, *args):
args = args + ("-execute\n",)
self.process.stdin.write(str.join("\n", args))
self.process.stdin.flush()
output = ""
fd = self.process.stdout.fileno()
while not output.endswith(self.sentinel):
output += os.read(fd, 4096).decode('utf-8', errors='ignore')
return output[:-len(self.sentinel)]
class Pandoc(CMDLine):
""" Pandoc command line call with piped in- and output """
@ -254,23 +105,68 @@ class Pandoc(CMDLine):
return stdout.decode('utf-8').strip()
class HeadlessChromium(CMDLine):
def __init__(self, url):
super().__init__('chromium-browser')
self.url = url
class ExifTool(CMDLine):
def __init__(self, fpath):
self.fpath = fpath
super().__init__('exiftool')
def get(self):
@staticmethod
def exifdate2iso(value):
""" converts and EXIF date string to ISO 8601 format
:param value: EXIF date (2016:05:01 00:08:24)
:type arg1: str
:return: ISO 8601 string with UTC timezone 2016-05-01T00:08:24+0000
:rtype: str
"""
if not isinstance(value, str):
return value
match = REGEX['exifdate'].match(value)
if not match:
return value
return "%s-%s-%sT%s+0000" % (
match.group('year'),
match.group('month'),
match.group('day'),
match.group('time')
)
def read(self):
cmd = (
self.executable,
'--headless',
'--disable-gpu',
'--disable-preconnect',
'--dump-dom',
'--timeout 60',
'--save-page-as-mhtml',
"%s" % self.url
'-sort',
'-json',
'-MIMEType',
'-FileType',
'-FileName',
'-ModifyDate',
'-CreateDate',
'-DateTimeOriginal',
'-ImageHeight',
'-ImageWidth',
'-Aperture',
'-FOV',
'-ISO',
'-FocalLength',
'-FNumber',
'-FocalLengthIn35mmFormat',
'-ExposureTime',
'-Copyright',
'-Artist',
'-Model',
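# the trailing '#' disables print conversion, so GPS values come back as plain numbers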
'-GPSLongitude#',
'-GPSLatitude#',
'-LensID',
'-LensSpec',
'-Lens',
'-ReleaseDate',
'-Description',
'-Headline',
'-HierarchicalSubject',
self.fpath
)
logging.debug('getting URL %s with headless chrome', self.url)
logging.debug('reading EXIF from %s', self.fpath)
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
@ -280,113 +176,111 @@ class HeadlessChromium(CMDLine):
stdout, stderr = p.communicate()
if stderr:
logging.error(
"Error getting URL:\n\t%s\n\t%s",
cmd,
stderr
)
return stdout.decode('utf-8').strip()
logging.error("Error reading EXIF:\n\t%s\n\t%s", cmd, stderr)
exif = json.loads(stdout.decode('utf-8').strip()).pop()
if 'ReleaseDate' in exif and 'ReleaseTime' in exif:
exif['DateTimeRelease'] = "%s %s" % (exif.get('ReleaseDate'), exif.get('ReleaseTime')[:8])
del(exif['ReleaseDate'])
del(exif['ReleaseTime'])
for k, v in exif.items():
exif[k] = self.exifdate2iso(v)
return exif
def __expandconfig():
c = configparser.ConfigParser(
interpolation=configparser.ExtendedInterpolation(),
allow_no_value=True
)
c.read('config.ini')
for s in c.sections():
for o in c.options(s):
curr = c.get(s, o)
if 'photo' == s and 'regex' == o:
REGEX.update({'photo': re.compile(curr)})
c.set(s, o, os.path.expanduser(curr))
return c
def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
""" Used to create short, lowercase slug for a number (an epoch) passed """
num = int(num)
return ((num == 0) and numerals[0]) or (
baseN(
num // b,
b,
numerals
).lstrip(numerals[0]) + numerals[num % b]
)
class wget(CMDLine):
def __init__(self, url, dirname=None):
super().__init__('wget')
self.url = url
self.slug = dirname or slugfname(self.url)
self.saveto = os.path.join(
config.get('source', 'offlinecopiesdir'),
self.slug
)
def slugfname(url):
return "%s" % slugify(
re.sub(r"^https?://(?:www)?", "", url),
only_ascii=True,
lower=True
)[:200]
def archive(self):
cmd = (
self.executable,
'-e',
'robots=off',
'--timeout=360',
'--no-clobber',
'--no-directories',
'--adjust-extension',
'--span-hosts',
'--wait=1',
'--random-wait',
'--convert-links',
#'--backup-converted',
'--page-requisites',
'--directory-prefix=%s' % self.saveto,
"%s" % self.url
)
logging.debug('getting URL %s with wget', self.url)
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
def __setup_sitevars():
SiteVars = {}
section = 'site'
for o in config.options(section):
SiteVars.update({o: config.get(section, o)})
stdout, stderr = p.communicate()
if stderr:
logging.error(
"Error getting URL:\n\t%s\n\t%s",
cmd,
stderr
)
return stdout.decode('utf-8').strip()
# add site author
section = 'author'
SiteVars.update({section: {}})
for o in config.options(section):
SiteVars[section].update({o: config.get(section, o)})
def find_realurl(url):
headers = requests.utils.default_headers()
headers.update({
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
# add extra sections to author
for sub in config.get('author', 'appendwith').split():
SiteVars[section].update({sub: {}})
for o in config.options(sub):
SiteVars[section][sub].update({o: config.get(sub, o)})
try:
r = requests.get(
url,
allow_redirects=True,
timeout=60,
headers=headers
)
except Exception as e:
logging.error('getting real url failed: %s', e)
return (None, 400)
# push the whole thing into cache
return SiteVars
finalurl = list(urlparse(r.url))
finalurl[4] = '&'.join(
[x for x in finalurl[4].split('&') if not x.startswith('utm_')])
finalurl = urlunparse(finalurl)
return (finalurl, r.status_code)
ARROWFORMAT = {
'iso': 'YYYY-MM-DDTHH:mm:ssZ',
'display': 'YYYY-MM-DD HH:mm'
}
def find_archiveorgurl(url):
url, status = find_realurl(url)
if status == requests.codes.ok:
return url
LLEVEL = {
'critical': 50,
'error': 40,
'warning': 30,
'info': 20,
'debug': 10
}
try:
a = requests.get(
"http://archive.org/wayback/available?url=%s" % url,
)
except Exception as e:
logging.error('Failed to fetch archive.org availability for %s' % url)
return None
REGEX = {
'exifdate': re.compile(
r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
),
'cleanurl': re.compile(r"^https?://(?:www)?"),
'urls': re.compile(
r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
),
'mdimg': re.compile(
r'(?P<shortcode>\!\[(?P<alt>[^\]]+)\]\((?P<fname>[^\s]+)'
r'(?:\s[\'\"](?P<title>[^\"\']+)[\'\"])?\)(?:\{(?P<css>[^\}]+)\})?)',
re.IGNORECASE
)
}
if not a:
logging.error('empty archive.org availability for %s' % url)
return None
config = __expandconfig()
try:
a = json.loads(a.text)
aurl = a.get(
'archived_snapshots', {}
).get(
'closest', {}
).get(
'url', None
)
if aurl:
logging.debug("found %s in archive.org for %s", aurl, url)
return aurl
except Exception as e:
logging.error("archive.org parsing failed: %s", e)
j2 = jinja2.Environment(
loader=jinja2.FileSystemLoader(
searchpath=config.get('dirs', 'tmpl')
),
lstrip_blocks=True
)
return None
site = __setup_sitevars()