From 4a699ef9f5179d403f7864010e9c7cf5631f2719 Mon Sep 17 00:00:00 2001
From: Peter Molnar
Date: Fri, 27 Oct 2017 10:29:33 +0100
Subject: [PATCH] 2.0-alpha1: tags dropped, favs dropped, bookmarks dropped,
 reposts dropped, better async rendering; TODO comments, websub pings,
 webmentions

---
 README.md        |  168 ++-
 archive.py       |   71 +-
 db.py            |  234 ++++
 nasg.py          | 2726 ++++++++++++++--------------------------
 requirements.txt |   28 +-
 router.py        |   86 ++
 shared.py        |  416 +++----
 7 files changed, 1495 insertions(+), 2234 deletions(-)
 create mode 100644 db.py
 mode change 100755 => 100644 nasg.py
 create mode 100644 router.py

diff --git a/README.md b/README.md
index ab4d41e..692e990 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,166 @@
-# NASG: Not Another Statig Generator...
+# NASG (Not Another Static Generator)
 
-So I ended up writing my static generator and this is (most) of the code for it.
+This is a tiny static site generator, written in Python, to scratch my own itches.
+It is most probably not suitable for anyone else.
 
-Don't expect anything fancy and please be aware that my Python Fu has much to learn.
+## Why not [insert static generator here]?
 
-I've written about the generic ideas and approaches here in my
-[Going Static](https://petermolnar.net/going-static/) entry.
+- DRY - Don't Repeat Yourself - is good, so instead of sidecar files for images, I'm using XMP metadata, which most of the generators available don't handle well;
+- writing a proper plugin for an existing generator - Pelican, Nikola, etc. - might have taken longer, and I wanted to extend my Python knowledge;
+- I wanted to use the best available utilities for some tasks, like `Pandoc` and `exiftool`, instead of Python libraries trying to achieve the same;
+- I needed to handle webmentions and comments.
+
+Don't expect anything fancy: my Python Fu has much to learn.
+
+## How content is organized
+
+The directory structure of the "source" is something like this:
+```
+├── content
+│   ├── category1 (containing YAML + MD files)
+│   ├── category2 (containing YAML + MD files)
+│   ├── photo (containing jpg files)
+│   ├── _category_excluded_from_listing_1 (containing YAML + MD files)
+│
+├── files
+│   ├── image (my own pictures)
+│   ├── photo -> ../content/photo
+│   └── pic (random images)
+├── nasg
+│   ├── archive.py
+│   ├── config.ini
+│   ├── db.py
+│   ├── LICENSE
+│   ├── nasg.py
+│   ├── README.md
+│   ├── requirements.txt
+│   ├── router.py
+│   ├── shared.py
+│   └── templates
+├── static
+│   ├── favicon.ico
+│   ├── favicon.png
+│   └── pgp.asc
+└── var
+    ├── gone.tsv
+    ├── redirects.tsv
+    ├── s.sqlite
+    ├── tokens.json
+    └── webmention.sqlite
+```
+
+Content files are either YAML front matter + Markdown, with an `.md` extension, or JPG with metadata, with a `.jpg` extension.
+
+Inline images in the content are checked against all subdirectories in `files`; they get their EXIF read and displayed as well if they match the regex in the configuration for the Artist and/or Copyright EXIF fields.
+
+`gone.tsv` is a simple list of URIs that should return a `410 Gone` message, while `redirects.tsv` is a tab-separated file of `from to` entries that should be `301` redirected. These go into a `magic.php` file, so if the host supports executing PHP, it will take care of this.
+
+## Output
+
+`nasg.py` generates a `build` directory which will have a directory per entry, with an `index.html`, so URLs can be `https://domain.com/filename/`.
+
+Categories are rendered into `category/category_name`. Pagination is under `category/category_name/page/X`. They include a feed as well, `category/category_name/feed`, in the form of an `index.atom` Atom feed.
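+
+For example, a hypothetical post `content/article/example-post.md`, with its category paginated and a feed, comes out as:
+
+```
+build
+├── magic.php
+├── example-post
+│   └── index.html
+└── category
+    └── article
+        ├── index.html
+        ├── feed
+        │   └── index.atom
+        └── page
+            └── 2
+                └── index.html
+```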
+
+## Webserver configuration
+
+A minimal nginx configuration for the virtualhost:
+```
+# --- Virtual Host ---
+upstream {{ domain }} {
+    server unix:/var/run/php/{{ domain }}.sock;
+}
+
+server {
+    listen 80;
+    server_name .{{ domain }};
+    rewrite ^ https://$server_name$request_uri redirect;
+    access_log /dev/null;
+    error_log /dev/null;
+}
+
+server {
+    listen 443 ssl http2;
+    server_name .{{ domain }};
+    ssl_certificate /etc/letsencrypt/live/{{ domain }}/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/{{ domain }}/privkey.pem;
+    ssl_dhparam dh.pem;
+    add_header X-Frame-Options "SAMEORIGIN";
+    add_header X-Content-Type-Options "nosniff";
+    add_header X-XSS-Protection "1; mode=block";
+    add_header Strict-Transport-Security "max-age=31536000; includeSubdomains;";
+
+    root /[path to root]/{{ domain }};
+
+    location = /favicon.ico {
+        log_not_found off;
+        access_log off;
+    }
+
+    location = /robots.txt {
+        log_not_found off;
+        access_log off;
+    }
+
+    location ~ ^(?<script_name>.+?\.php)(?<path_info>.*)$ {
+        try_files $uri $script_name =404;
+        fastcgi_param SCRIPT_FILENAME $document_root$script_name;
+        fastcgi_param SCRIPT_NAME $script_name;
+        fastcgi_param PATH_INFO $path_info;
+        fastcgi_param PATH_TRANSLATED $document_root$path_info;
+        fastcgi_param QUERY_STRING $query_string;
+        fastcgi_param REQUEST_METHOD $request_method;
+        fastcgi_param CONTENT_TYPE $content_type;
+        fastcgi_param CONTENT_LENGTH $content_length;
+        fastcgi_param SCRIPT_NAME $script_name;
+        fastcgi_param REQUEST_URI $request_uri;
+        fastcgi_param DOCUMENT_URI $document_uri;
+        fastcgi_param DOCUMENT_ROOT $document_root;
+        fastcgi_param SERVER_PROTOCOL $server_protocol;
+        fastcgi_param GATEWAY_INTERFACE CGI/1.1;
+        fastcgi_param SERVER_SOFTWARE nginx;
+        fastcgi_param REMOTE_ADDR $remote_addr;
+        fastcgi_param REMOTE_PORT $remote_port;
+        fastcgi_param SERVER_ADDR $server_addr;
+        fastcgi_param SERVER_PORT $server_port;
+        fastcgi_param SERVER_NAME $server_name;
+        fastcgi_param HTTP_PROXY "";
+        fastcgi_param HTTPS $https if_not_empty;
+        fastcgi_param SSL_PROTOCOL $ssl_protocol if_not_empty;
+        fastcgi_param SSL_CIPHER $ssl_cipher if_not_empty;
+        fastcgi_param SSL_SESSION_ID $ssl_session_id if_not_empty;
+        fastcgi_param SSL_CLIENT_VERIFY $ssl_client_verify if_not_empty;
+        fastcgi_param REDIRECT_STATUS 200;
+        fastcgi_index index.php;
+        fastcgi_connect_timeout 10;
+        fastcgi_send_timeout 360;
+        fastcgi_read_timeout 3600;
+        fastcgi_buffer_size 512k;
+        fastcgi_buffers 512 512k;
+        fastcgi_keep_conn on;
+        fastcgi_intercept_errors on;
+        fastcgi_split_path_info ^(?<script_name>.+?\.php)(?<path_info>.*)$;
+        fastcgi_pass {{ domain }};
+    }
+
+    location / {
+        try_files $uri $uri/ $uri.html $uri/index.html $uri/index.xml $uri/index.atom index.php @rewrites;
+    }
+
+    location @rewrites {
+        rewrite ^ /magic.php?$args last;
+    }
+
+    location ~* \.(css|js|eot|woff|ttf|woff2)$ {
+        expires 1d;
+        add_header Cache-Control "public, must-revalidate, proxy-revalidate";
+        add_header "Vary" "Accept-Encoding";
+    }
+
+    location ~* \.(png|ico|gif|svg|jpg|jpeg|webp|avi|mpg|mpeg|mp4|mp3)$ {
+        expires 7d;
+        add_header Cache-Control "public, must-revalidate, proxy-revalidate";
+        add_header "Vary" "Accept-Encoding";
+    }
+}
+
+```
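+
+Anything the `location /` `try_files` chain cannot resolve falls through to `@rewrites`, i.e. to `magic.php`, which serves the `301`s from `redirects.tsv` and the `410`s from `gone.tsv`. Hypothetical examples of the two files:
+
+```
+# redirects.tsv: "from to" pairs, one per line
+former-slug current-slug
+
+# gone.tsv: one URI per line
+some-deleted-entry/
+```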
diff --git a/archive.py b/archive.py
index c159929..9193a78 100644
--- a/archive.py
+++ b/archive.py
@@ -5,14 +5,16 @@ import glob
 import logging
 import shutil
 import subprocess
+import imghdr
 
 import arrow
+from pprint import pprint
+
 from requests_oauthlib import OAuth1Session, oauth1_session, OAuth2Session, oauth2_session
 from oauthlib.oauth2 import BackendApplicationClient
-
+import db
 import shared
 
-
 class Favs(object):
     def __init__(self, confgroup):
         self.confgroup = confgroup
@@ -101,6 +103,7 @@ class FlickrFavs(Favs):
             fav = FlickrFav(photo)
             if not fav.exists:
                 fav.run()
+                #fav.fix_extension()
 
 class FivehpxFavs(Favs):
     def __init__(self):
@@ -179,6 +182,7 @@ class FivehpxFavs(Favs):
             fav = FivehpxFav(photo)
             if not fav.exists:
                 fav.run()
+                #fav.fix_extension()
 
 
 class TumblrFavs(Favs):
@@ -242,7 +246,7 @@ class DAFavs(Favs):
             'https://www.deviantart.com/api/v1/oauth2/collections/folders',
             params={
                 'username': self.username,
-                'calculate_size': 'false',
+                'calculate_size': 'true',
                 'ext_preload': 'false',
                 'mature_content': 'true'
             }
@@ -304,29 +308,29 @@ class DAFavs(Favs):
         has_more = self.has_more(js.get('has_more'))
         offset = js.get('next_offset')
         while True == has_more:
-            logging.info('iterating over DA results with offset %d', offset)
+            #logging.info('iterating over DA results with offset %d', offset)
             paged = self.getpaged(offset)
             new = paged.get('results', [])
             if not len(new):
                 #logging.error('empty results from deviantART, breaking loop')
                 break
-            favs = favs + new
+            favs = [*favs, *new]
             has_more = self.has_more(paged.get('has_more'))
             if not has_more:
                 break
             n = int(paged.get('next_offset'))
             if not n:
                 break
-            offset = offset + n
+            offset = n
         self.favs = favs
         for fav in self.favs:
             f = DAFav(fav)
-            if f.exists:
-                continue
+            if not f.exists:
+                f.fav.update({'meta': self.getsinglemeta(fav.get('deviationid'))})
+                f.run()
+                #f.fix_extension()
 
-            f.fav.update({'meta': self.getsinglemeta(fav.get('deviationid'))})
-            f.run()
 
 class ImgFav(object):
     def __init__(self):
@@ -349,7 +353,19 @@ class ImgFav(object):
 
     @property
     def exists(self):
-        return os.path.exists(self.target)
+        maybe = glob.glob(self.target.replace('.jpg', '.*'))
+        if len(maybe):
+            return True
+        return False
+
+    def fix_extension(self):
+        # identify the file format from content; imghdr returns None if unknown
+        what = imghdr.what(self.target)
+        if not what:
+            return
+        # rename the file to match the detected format
+        new = self.target.replace('.jpg', '.%s' % what)
+        if new != self.target:
+            shutil.move(self.target, new)
+            self.target = new
 
     def pull_image(self):
         logging.info("pulling image %s to %s", self.imgurl, self.target)
@@ -359,8 +375,11 @@ class ImgFav(object):
             r.raw.decode_content = True
             shutil.copyfileobj(r.raw, f)
 
-
     def write_exif(self):
+        what = imghdr.what(self.target)
+        if what not in ('jpeg', 'png'):
+            return
+
         logging.info('populating EXIF data of %s' % self.target)
         tags = list(set(self.meta.get('tags',[])))
         dt = self.meta.get('dt').to('utc')
@@ -387,7 +406,7 @@ class ImgFav(object):
         params = [
             'exiftool',
             '-overwrite_original',
-            '-EXIF:Artist=%s' % author_name[:64],
+            #'-EXIF:Artist=%s' % author_name[:64],
             '-XMP:Copyright=Copyright %s %s (%s)' % (
                 dt.format('YYYY'),
                 author_name,
@@ -501,6 +520,7 @@ class FlickrFav(ImgFav):
             self.photo.get('description', {}).get('_content', '')
         )
 
+        self.fix_extension()
         self.write_exif()
 
 class FivehpxFav(ImgFav):
@@ -546,12 +566,14 @@ class FivehpxFav(ImgFav):
         }
         c = "%s" % self.photo.get('description', '')
         self.content = shared.Pandoc('plain').convert(c)
+        self.fix_extension()
         self.write_exif()
 
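+# NOTE: imghdr identifies the format from magic bytes and returns names like
+# 'jpeg' and 'png' (never 'jpg'), so fix_extension() renames a downloaded
+# foo.jpg that really is a JPEG to foo.jpeg; the glob-based exists checks
+# above ('foo.*') are what keep that rename idempotent across runs.
+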
 class DAFav(ImgFav):
     def __init__(self, fav):
         self.fav = fav
         self.deviationid = fav.get('deviationid')
+        #logging.info('working on %s', self.deviationid)
         self.url = fav.get('url')
         self.title = fav.get('title', False) or self.deviationid
         self.author = self.fav.get('author').get('username')
@@ -562,9 +584,21 @@
                 shared.slugfname(self.author)
             )
         )
+
+        self.imgurl = None
+        if 'content' in fav:
+            if 'src' in fav['content']:
+                self.imgurl = fav.get('content').get('src')
+        elif 'preview' in fav:
+            if 'src' in fav['preview']:
+                self.imgurl = fav.get('preview').get('src')
-        self.imgurl = fav.get('content', {}).get('src')
 
     def run(self):
+        if not self.imgurl:
+            logging.error('imgurl is empty for deviantart %s', self.deviationid)
+            return
+
         self.pull_image()
 
         self.meta = {
@@ -583,6 +617,7 @@
         }
         c = "%s" % self.fav.get('meta', {}).get('description', '')
         self.content = shared.Pandoc('plain').convert(c)
+        self.fix_extension()
         self.write_exif()
 
 
@@ -600,7 +635,10 @@
 
     @property
     def exists(self):
-        return os.path.exists(self.target.replace('.jpg', '_0.jpg'))
+        maybe = glob.glob(self.target.replace('.jpg', '_0.*'))
+        if len(maybe):
+            return True
+        return False
 
     def run(self):
         content = "%s" % self.like.get('caption', '')
@@ -635,6 +673,7 @@
             img.content = content
             img.meta = meta
             img.pull_image()
+            img.fix_extension()
             img.write_exif()
             icntr = icntr + 1
 
@@ -681,7 +720,7 @@
         self.service = service
         self.key = shared.config.get("api_%s" % service, 'api_key')
         self.secret = shared.config.get("api_%s" % service, 'api_secret')
-        self.tokendb = shared.TokenDB()
+        self.tokendb = db.TokenDB()
         self.t = self.tokendb.get_service(self.service)
         self.oauth_init()
@@ -796,7 +835,7 @@ class TumblrOauth(Oauth1Flow):
 
 if __name__ == '__main__':
-    logging.basicConfig(level=10)
+    logging.basicConfig(level=20)
 
     flickr = FlickrFavs()
     flickr.run()
diff --git a/db.py b/db.py
new file mode 100644
index 0000000..65f1932
--- /dev/null
+++ b/db.py
@@ -0,0 +1,234 @@
+import os
+import json
+import sqlite3
+import glob
+import shared
+
+# TODO sqlite3 cache instead of filesystem ?
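+
+# The token store behind TokenDB below is a flat JSON file (var/tokens.json);
+# a sketch of the layout it maintains, with placeholder values:
+#
+# {
+#     "flickr": "<tokenid>",            # set_service: service -> token id
+#     "<tokenid>": {                    # update_token: token id -> credentials
+#         "oauth_token_secret": "...",
+#         "access_token": "...",
+#         "access_token_secret": "...",
+#         "verifier": "..."
+#     }
+# }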
+
+class TokenDB(object):
+    def __init__(self):
+        self.db = shared.config.get('var', 'tokendb')
+        self.tokens = {}
+        self.refresh()
+
+    def refresh(self):
+        self.tokens = {}
+        if os.path.isfile(self.db):
+            with open(self.db, 'rt') as f:
+                self.tokens = json.loads(f.read())
+
+    def save(self):
+        with open(self.db, 'wt') as f:
+            f.write(json.dumps(
+                self.tokens, indent=4, sort_keys=True
+            ))
+
+    def get_token(self, token):
+        return self.tokens.get(token, None)
+
+    def get_service(self, service):
+        token = self.tokens.get(service, None)
+        return token
+
+    def set_service(self, service, tokenid):
+        self.tokens.update({
+            service: tokenid
+        })
+        self.save()
+
+    def update_token(self,
+                     token,
+                     oauth_token_secret=None,
+                     access_token=None,
+                     access_token_secret=None,
+                     verifier=None):
+
+        t = self.tokens.get(token, {})
+        if oauth_token_secret:
+            t.update({
+                'oauth_token_secret': oauth_token_secret
+            })
+        if access_token:
+            t.update({
+                'access_token': access_token
+            })
+        if access_token_secret:
+            t.update({
+                'access_token_secret': access_token_secret
+            })
+        if verifier:
+            t.update({
+                'verifier': verifier
+            })
+
+        self.tokens.update({
+            token: t
+        })
+        self.save()
+
+    def clear(self):
+        self.tokens = {}
+        self.save()
+
+    def clear_service(self, service):
+        t = self.tokens.get(service)
+        if t:
+            del(self.tokens[t])
+            del(self.tokens[service])
+        self.save()
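+
+# Note on SearchDB below: the corpus lives in an SQLite FTS5 virtual table,
+# and search_by_query() passes the user's input straight to MATCH, so FTS5
+# query syntax applies, e.g. (hypothetical terms):
+#   indieweb AND webmention
+#   static NOT dynamic
+#   pando*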
+
+class SearchDB(object):
+    tmplfile = 'Search.html'
+
+    def __init__(self):
+        self.db = sqlite3.connect(
+            "%s" % shared.config.get('var', 'searchdb')
+        )
+
+        cursor = self.db.cursor()
+        cursor.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS data USING FTS5(
+            id,
+            corpus,
+            mtime,
+            url,
+            category,
+            title
+        )''')
+        self.db.commit()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.finish()
+
+    def finish(self):
+        self.db.close()
+
+    def append(self, id, corpus, mtime, url, category, title):
+        mtime = int(mtime)
+        cursor = self.db.cursor()
+        cursor.execute('''UPDATE data SET corpus=?, mtime=?, url=?, category=?, title=? WHERE id=?;''', (
+            corpus,
+            mtime,
+            url,
+            category,
+            title,
+            id
+        ))
+        cursor.execute('''INSERT OR IGNORE INTO data (id, corpus, mtime, url, category, title) VALUES (?,?,?,?,?,?);''', (
+            id,
+            corpus,
+            mtime,
+            url,
+            category,
+            title
+        ))
+        self.db.commit()
+
+    def is_uptodate(self, fname, mtime):
+        cursor = self.db.cursor()
+        cursor.execute('''SELECT mtime
+            FROM data
+            WHERE id = ? AND mtime = ?''',
+            (fname,mtime)
+        )
+        rows = cursor.fetchall()
+        if len(rows):
+            return True
+        return False
+
+    def search_by_query(self, query):
+        ret = {}
+        cursor = self.db.cursor()
+        cursor.execute('''SELECT
+            id, category, url, title, highlight(data, 0, '', '') corpus
+            FROM data
+            WHERE data MATCH ?
+            ORDER BY category, rank;''', (query,))
+        rows = cursor.fetchall()
+        for r in rows:
+            r = {
+                'id': r[0],
+                'category': r[1],
+                'url': r[2],
+                'title': r[3],
+                'txt': r[4],
+            }
+
+            category = r.get('category')
+            if category not in ret:
+                ret.update({category: {}})
+
+            maybe_fpath = os.path.join(
+                shared.config.get('dirs', 'content'),
+                category,
+                "%s.*" % r.get('id')
+            )
+            #fpath = glob.glob(maybe_fpath).pop()
+            ret.get(category).update({
+                r.get('id'): {
+                    #'fpath': fpath,
+                    'url': r.get('url'),
+                    'title': r.get('title'),
+                    'txt': r.get('txt')
+                }
+            })
+        return ret
+
+
+    def cli(self, query):
+        results = self.search_by_query(query)
+        for c, items in sorted(results.items()):
+            print("%s:" % c)
+            for fname, data in sorted(items.items()):
+                print("  %s" % data.get('fpath'))
+                print("  %s" % data.get('url'))
+                print("")
+
+    def html(self, query):
+        tmplvars = {
+            'results': self.search_by_query(query),
+            'term': query
+        }
+        return shared.j2.get_template(self.tmplfile).render(tmplvars)
+
+
+class WebmentionQueue(object):
+    def __init__(self):
+        self.db = sqlite3.connect(
+            "%s" % shared.config.get('var', 'webmentiondb')
+        )
+
+        cursor = self.db.cursor()
+        cursor.execute('''CREATE TABLE IF NOT EXISTS `archive` (
+            `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
+            `received` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+            `processed` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+            `source` TEXT NOT NULL,
+            `target` TEXT NOT NULL
+        );''')
+
+        cursor.execute('''CREATE TABLE IF NOT EXISTS `queue` (
+            `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
+            `timestamp` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+            `source` TEXT NOT NULL,
+            `target` TEXT NOT NULL
+        );''')
+        self.db.commit()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.finish()
+
+    def finish(self):
+        self.db.close()
+
+    def queue(self, source, target):
+        cursor = self.db.cursor()
+        cursor.execute(
+            '''INSERT INTO queue (source,target) VALUES (?,?);''', (
+                source,
+                target
+            )
+        )
+        self.db.commit()
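
A minimal usage sketch for the queue above, with hypothetical URLs, assuming `config.ini` provides the `var` / `webmentiondb` path:

```python
import db

# open (and, if needed, create) the webmention queue database
wq = db.WebmentionQueue()
# store an incoming mention; processing happens later, asynchronously
wq.queue(
    'https://example.com/their-reply/',  # source: the page linking here
    'https://example.net/my-post/'       # target: the page being mentioned
)
wq.finish()
```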
diff --git a/nasg.py b/nasg.py
old mode 100755
new mode 100644
index d932b79..e91810b
--- a/nasg.py
+++ b/nasg.py
@@ -2,79 +2,117 @@
 
 import os
 import re
-import configparser
-import argparse
-import shutil
 import logging
+import configparser
 import json
 import glob
-import tempfile
-import atexit
-import re
-import hashlib
-import math
+import argparse
+import shutil
+from urllib.parse import urlparse
 import asyncio
+from math import ceil
 import csv
-import getpass
-#import quopri
-#import base64
-#import mimetypes
-import copy
+import sqlite3
 
-import magic
-import arrow
-import wand.image
-#import similar_text
 import frontmatter
-from slugify import slugify
+import arrow
 import langdetect
-import requests
-from whoosh import index
-from whoosh import qparser
-import jinja2
-import urllib.parse
-from webmentiontools.send import WebmentionSend
-import bleach
-from emoji import UNICODE_EMOJI
-#from bs4 import BeautifulSoup
-#from readability.readability import Document
+import wand.image
+
 import shared
-#import oauth
+import db
 
-def splitpath(path):
-    parts = []
-    (path, tail) = os.path.split(path)
-    while path and tail:
-        parts.insert(0,tail)
-        (path,tail) = os.path.split(path)
-    return parts
+from pprint import pprint
+
+class MagicPHP(object):
+    name = 'magic.php'
+
+    def __init__(self):
+        # init 'gone 410' array
+        self.gones = []
+        f = shared.config.get('var', 'gone')
+        if os.path.isfile(f):
+            with open(f) as csvfile:
+                reader = csv.reader(csvfile, delimiter=' ')
+                for row in reader:
+                    self.gones.append(row[0])
+        # init manual redirects array
+        self.redirects = []
+        f = shared.config.get('var', 'redirects')
+        if os.path.isfile(f):
+            with open(f) as csvfile:
+                reader = csv.reader(csvfile, delimiter=' ')
+                for row in reader:
+                    self.redirects.append((row[0], row[1]))
+
+    @property
+    def phpfile(self):
+        return os.path.join(
+            shared.config.get('common', 'build'),
+            self.name
+        )
+
+    async def render(self):
+        logging.info('saving %s' % (self.name))
+        o = self.phpfile
+        tmplfile = "%s.html" % (__class__.__name__)
+        r = shared.j2.get_template(tmplfile).render({
+            'site': shared.site,
+            'redirects': self.redirects,
+            'gones': self.gones
+        })
+        with open(o, 'wt') as out:
+            logging.debug('writing file %s' % (o))
+            out.write(r)
 
-class BaseIter(object):
+class NoDupeContainer(object):
+    """ Base class to hold key => data dicts, erroring on duplicate keys """
     def __init__(self):
         self.data = {}
+        self.default = None
 
     def append(self, key, value):
-        if key in self.data:
-            logging.warning("duplicate key: %s, using existing instead", key)
-            existing = self.data.get(key)
-            if hasattr(value, 'fname') and hasattr(existing, 'fname'):
-                logging.warning(
-                    "%s collides with existing %s",
-                    value.fname,
-                    existing.fname
-                )
+        # all clear
+        if key not in self.data:
+            self.data.update({key: value})
             return
-        self.data[key] = value
 
+        # problem
+        logging.error(
+            "duplicate key error when populating %s: %s",
+            self.__class__.__name__,
+            key
+        )
+        logging.error(
+            "current: %s",
+            self.data.get(key)
+        )
+        logging.error(
+            "problem: %s",
+            value
+        )
+
+        return
+
+    # TODO: return ordered version of data
 
     def __getitem__(self, key):
-        return self.data.get(key, {})
+        return self.data.get(key, self.default)
 
+    #def __delitem__(self, key):
+        #return del(self.data[key])
 
-    def __repr__(self):
-        return json.dumps(list(self.data.values()))
+    def __setitem__(self, key, value):
+        return self.append(key, value)
 
+    def __contains__(self, key):
+        if key in self.data.keys():
+            return True
+        return False
+
+    def __len__(self):
+        return len(self.data.keys())
 
     def __next__(self):
         try:
@@ -83,700 +121,641 @@
             raise StopIteration()
         return r
 
-
     def __iter__(self):
         for k, v in self.data.items():
             yield (k, v)
         return
 
+    #def __repr__(self):
+        #return json.dumps(self.data)
 
-class BaseRenderable(object):
+    #def __str__(self):
+        #return "iteration container with %s items" % (len(self.data.keys()))
+
+
+class FContainer(NoDupeContainer):
+    """ A container that holds a list of files; based on NoDupeContainer, so it errors on duplicate slugs, and is populated with a recursive glob """
+    def __init__(self, dirs=[''], extensions=['*']):
+        super().__init__()
+        files = []
+        for ext in extensions:
+            for p in dirs:
+                files.extend(glob.iglob(
+                    os.path.join(p,'*.%s' % (ext)),
+                    recursive=True
+                ))
+        # eliminate duplicates
+        files = list(set(files))
+        for fpath in files:
+            fname = os.path.basename(fpath)
+            self.append(fname, fpath)
+
+class Content(FContainer):
+    """ A container that holds Markdown and JPG files, parsed on the fly as the container is populated; based on FContainer """
     def __init__(self):
-        return
+        dirs=[os.path.join(shared.config.get('dirs', 'content'), "**")]
+        extensions=['md', 'jpg']
+        super().__init__(dirs, extensions)
+        for fname, fpath in self.data.items():
+            self.data.update({fname: Singular(fpath)})
 
+class Category(NoDupeContainer):
+    """ A Category which holds pubtime (int) => Singular data """
+    indexfile = 'index.html'
+    feedfile = 'index.atom'
+    feeddir = 'feed'
+    pagedir = 'page'
+    taxonomy = 'category'
 
-    def 
writerendered(self, content, mtime=None): - mtime = mtime or self.mtime - d = os.path.dirname(self.target) - if not os.path.isdir(d): - os.mkdir(d) + def __init__(self, name=''): + self.name = name + super().__init__() - with open(self.target, "w") as html: - logging.debug('writing %s', self.target) - html.write(content) - html.close() - os.utime(self.target, (mtime, mtime)) + def append(self, post): + return super().append(post.pubtime, post) + @property + def mtime(self): + return int(sorted(self.data.keys(), reverse=True)[0]) -class Indexer(object): - def __init__(self): - self.target = os.path.abspath(os.path.join( - shared.config.get('target', 'builddir'), - shared.config.get('var', 'searchdb') - )) - - if not os.path.isdir(self.target): - os.mkdir(self.target) - - self.mtime = 0 - - if index.exists_in(self.target): - self.ix = index.open_dir(self.target) - tocfiles = glob.glob(os.path.join( - self.target, - "_MAIN_*.toc" - )) - if len(tocfiles): - self.mtime = int(os.path.getmtime(tocfiles[0])) - else: - self.ix = index.create_in(self.target, shared.schema) - - self.writer = self.ix.writer() - self.qp = qparser.QueryParser("url", schema=shared.schema) - - - async def append(self, singular): - if singular.isfuture: - return - - logging.debug("searching for existing index for %s", singular.fname) - if self.mtime >= singular.mtime: - logging.debug("search index is newer than post mtime (%d vs %d), skipping post", self.mtime, singular.mtime) - return - - exists = False - q = self.qp.parse(singular.url) - r = self.ix.searcher().search(q, limit=1) - if r: - r = r[0] - # nothing to do, the entry is present and is up to date - ixtime = r['mtime'] - if int(ixtime) == int(singular.mtime): - logging.info("search index is up to date for %s", singular.fname) - return - else: - logging.info("search index is out of date: %d (indexed) vs %d", ixtime, singular.mtime) - exists = True - - reactions = [] - for t, v in singular.reactions.items(): - if isinstance(v, list) and len(v): - reactions = [*reactions, *v] - - content = "\n\n".join([ - "\n\n".join([ - bleach.clean(c, tags=[], strip_comments=True, strip=True) - for c in [singular.sumhtml, singular.html] - ]), - "\n".join(["%s" % c for c in singular.tags]), - "\n\n".join(reactions) - ]) - - weight = 1 - if singular.isbookmark or singular.isfav: - weight = 10 - if singular.ispage: - weight = 100 - - if not len(singular.title): - title = singular.fname - else: - title = singular.title - - if singular.photo: - img = shared.Pandoc().convert("%s" % singular.photo) - else: - img = '' - - args = { - 'url': singular.url, - 'category': singular.category, - 'date': singular.published.datetime, - 'title': title, - 'weight': weight, - 'img': img, - 'content': content, - 'fuzzy': content, - 'mtime': singular.mtime - } - - if exists: - logging.info("updating search index with %s", singular.fname) - self.writer.add_document(**args) - else: - logging.info("appending search index with %s", singular.fname) - self.writer.update_document(**args) - - - def finish(self): - self.writer.commit() - - -class Renderer(object): - def __init__(self): - self.sitevars = dict(shared.config.items('site')) - self.sitevars['author'] = dict(shared.config.items('author')) - self.sitevars['author']['socials'] = dict(shared.config.items('socials')) - self.sitevars['author']['qr'] = dict(shared.config.items('qr')) - - self.jinjaldr = jinja2.FileSystemLoader( - searchpath=shared.config.get('source', 'templatesdir') - ) - self.j2 = jinja2.Environment(loader=self.jinjaldr) - 
self.j2.filters['date'] = Renderer.jinja_filter_date - self.j2.filters['search'] = Renderer.jinja_filter_search - self.j2.filters['slugify'] = Renderer.jinja_filter_slugify - - - @staticmethod - def jinja_filter_date(d, form='%Y-%m-%d %H:%m:%S'): - if d == 'now': - d = arrow.now().datetime - if form == 'c': - return d.isoformat() - #form = '%Y-%m-%dT%H:%M:%S%z' - return d.strftime(form) - - - @staticmethod - def jinja_filter_slugify(s): - return slugify(s, only_ascii=True, lower=True) - - - @staticmethod - def jinja_filter_search(s, r): - if r in s: + @property + def is_uptodate(self): + index = os.path.join(self.path_paged(), self.indexfile) + if not os.path.isfile(index): + return False + mtime = os.path.getmtime(index) + if mtime == self.mtime: return True return False - -# based on http://stackoverflow.com/a/10075210 -class ExifTool(shared.CMDLine): - """ Handles calling external binary `exiftool` in an efficient way """ - sentinel = "{ready}\n" - - - def __init__(self): - super().__init__('exiftool') - - - def run(self, *filenames): - return json.loads(self.execute( - '-sort', - '-json', - '-MIMEType', - '-FileType', - '-FileName', - '-ModifyDate', - '-CreateDate', - '-DateTimeOriginal', - '-ImageHeight', - '-ImageWidth', - '-Aperture', - '-FOV', - '-ISO', - '-FocalLength', - '-FNumber', - '-FocalLengthIn35mmFormat', - '-ExposureTime', - '-Copyright', - '-Artist', - '-Model', - '-GPSLongitude#', - '-GPSLatitude#', - '-LensID', - *filenames)) - - -class Comment(BaseRenderable): - def __init__(self, path): - logging.debug("initiating comment object from %s", path) - self.path = path - self.fname, self.ext = os.path.splitext(os.path.basename(self.path)) - self.mtime = int(os.path.getmtime(self.path)) - self.meta = {} - self.content = '' - self.tmplfile = 'comment.html' - self.__parse() - - - def __repr__(self): - return "%s" % (self.path) - - - def __parse(self): - with open(self.path, mode='rt') as f: - self.meta, self.content = frontmatter.parse(f.read()) - @property - def reacji(self): - if hasattr(self, '_reacji'): - return self._reacji + def title(self): + # TODO proper title + return self.name - self._reacji = '' - maybe = bleach.clean( - self.content, - tags=[], - strip_comments=True, - strip=True, - ).strip() + def url_paged(self, page=1, feed=False): + x = '/' + if self.name: + x = "%s%s/%s" % ( + x, + self.taxonomy, + self.name, + ) - if not len(maybe): - self._reacji = '★' - elif maybe in UNICODE_EMOJI: - self._reacji = maybe + if page == 1 and feed: + x = "%s/%s/" % (x, self.feeddir) + else: + x = "%s/%s/%s/" % (x, self.pagedir, "%s" % page) + return x - #t = self.meta.get('type', 'webmention') - #typemap = { - #'like-of': '👍', - #'bookmark-of': '🔖', - #'favorite': '★', - #} + def path_paged(self, page=1, feed=False): + x = shared.config.get('common', 'build') - #if t in typemap.keys(): - #self._reacji = typemap[t] - #else: + if self.name: + x = os.path.join( + x, + self.taxonomy, + self.name, + ) + + if page == 1: + if feed: + x = os.path.join(x, self.feeddir) + else: + x = os.path.join(x, self.pagedir, "%s" % page) + + if not os.path.isdir(x): + os.makedirs(x) + return x + + + def write_html(self, path, content): + with open(path, 'wt') as out: + logging.debug('writing file %s' % (path)) + out.write(content) + os.utime(path, (self.mtime, self.mtime)) + + + async def render(self): + if self.is_uptodate: + return + + pagination = shared.config.getint('display', 'pagination') + pages = ceil(len(self.data) / pagination) + page = 1 + while page <= pages: + # list relevant post 
templates + start = int((page-1) * pagination) + end = int(start + pagination) + posttmpls = [ + self.data[k].tmplvars + for k in list(sorted( + self.data.keys(), + reverse=True + ))[start:end] + ] + # define data for template + tmplvars = { + 'taxonomy': { + 'title': self.title, + 'name': self.name, + 'page': page, + 'total': pages, + 'perpage': pagination, + 'lastmod': arrow.get(self.mtime).format(shared.ARROWFORMAT['iso']), + 'feed': self.url_paged(page=1, feed=True), + 'url': self.url_paged(page), + }, + 'site': shared.site, + 'posts': posttmpls, + } + # render HTML + dirname = self.path_paged(page) + o = os.path.join(dirname, self.indexfile) + logging.info("Rendering page %d/%d of category %s to %s", page, pages, self.name, o) + tmplfile = "%s.html" % (__class__.__name__) + r = shared.j2.get_template(tmplfile).render(tmplvars) + self.write_html(o, r) + # render feed + if 1 == page: + dirname = self.path_paged(page, feed=True) + o = os.path.join(dirname, self.feedfile) + logging.info("Rendering feed of category %s to %s", self.name, o) + tmplfile = "%s_%s.html" % (__class__.__name__, self.feeddir) + r = shared.j2.get_template(tmplfile).render(tmplvars) + self.write_html(o, r) + # inc. page counter + page = page+1 + + +class Singular(object): + indexfile = 'index.html' + + def __init__(self, fpath): + logging.debug("initiating singular object from %s", fpath) + self.fpath = fpath + self.mtime = os.path.getmtime(self.fpath) + self.fname, self.fext = os.path.splitext(os.path.basename(self.fpath)) + self.category = os.path.basename(os.path.dirname(self.fpath)) + self._images = NoDupeContainer() + + if '.md' == self.fext: + with open(self.fpath, mode='rt') as f: + self.fm = frontmatter.parse(f.read()) + self.meta, self.content = self.fm + self.photo = None + elif '.jpg' == self.fext: + self.photo = WebImage(self.fpath) + self.meta = self.photo.fm_meta + self.content = self.photo.fm_content + self.photo.inline = False + self.photo.cssclass = 'u-photo' - return self._reacji @property - def html(self): - if hasattr(self, '_html'): - return self._html - - tmp = shared.Pandoc().convert(self.content) - self._html = bleach.clean(tmp, strip=True) - return self._html - + def redirects(self): + r = self.meta.get('redirect', []) + r.append(self.shortslug) + return list(set(r)) @property - def tmplvars(self): - if hasattr(self, '_tmplvars'): - return self._tmplvars + def is_uptodate(self): + if not os.path.isfile(self.htmlfile): + return False + mtime = os.path.getmtime(self.htmlfile) + if mtime == self.mtime: + return True + return False - self._tmplvars = { - 'published': self.published.datetime, - 'author': self.meta.get('author', {}), - #'content': self.content, - #'html': self.html, - 'source': self.source, - 'target': self.targeturl, - 'type': self.meta.get('type', 'webmention'), - 'reacji': self.reacji, - 'fname': self.fname - } - return self._tmplvars + @property + def htmlfile(self): + return os.path.join( + shared.config.get('common', 'build'), + self.fname, + self.indexfile + ) + @property + def images(self): + if self.photo: + self._images.append(self.fname, self.photo) + # add inline images + for shortcode, alt, fname, title, css in self.inline_images: + # this does the appending automatically + im = self._find_image(fname) + + return self._images + + @property + def exif(self): + if not self.photo: + return {} + return self.photo.exif @property def published(self): - if hasattr(self, '_published'): - return self._published - self._published = arrow.get(self.meta.get('date', self.mtime)) - 
return self._published + return arrow.get(self.meta.get('published', self.mtime)) + @property + def updated(self): + u = self.meta.get('updated', False) + if u: + u = arrow.get(u) + return u @property def pubtime(self): return int(self.published.timestamp) + @property + def is_reply(self): + return self.meta.get('in-reply-to', False) @property - def source(self): - if hasattr(self, '_source'): - return self._source - s = self.meta.get('source', '') - domains = shared.config.get('site', 'domains').split(' ') - self._source = s - for d in domains: - if d in s: - self._source = '' - return self._source - + def is_future(self): + now = arrow.utcnow().timestamp + if self.pubtime > now: + return True + return False @property - def targeturl(self): - if hasattr(self, '_targeturl'): - return self._targeturl - t = self.meta.get('target', shared.config.get('site', 'url')) - self._targeturl = '{p.path}'.format(p=urllib.parse.urlparse(t)).strip('/') - return self._targeturl - - @property - def target(self): - if hasattr(self, '_target'): - return self._target - - targetdir = os.path.abspath(os.path.join( - shared.config.get('target', 'builddir'), - shared.config.get('site', 'commentspath'), - self.fname - )) - - self._target = os.path.join(targetdir, 'index.html') - return self._target - - - async def render(self, renderer): - logging.info("rendering and saving comment %s", self.fname) - - if not shared.config.getboolean('params', 'force') and os.path.isfile(self.target): - ttime = int(os.path.getmtime(self.target)) - logging.debug('ttime is %d mtime is %d', ttime, self.mtime) - if ttime == self.mtime: - logging.debug( - '%s exists and up-to-date (lastmod: %d)', - self.target, - ttime - ) - return - - tmplvars = { - 'reply': self.tmplvars, - 'site': renderer.sitevars, - 'taxonomy': {}, + def licence(self): + l = shared.config.get('licence', self.category, + fallback=shared.config.get('licence', 'default',)) + return { + 'text': 'CC %s 4.0' % l.upper(), + 'url': 'https://creativecommons.org/licenses/%s/4.0/' % l, } - r = renderer.j2.get_template(self.tmplfile).render(tmplvars) - self.writerendered(r) + @property + def corpus(self): + corpus = "\n".join([ + "%s" % self.meta.get('title', ''), + "%s" % self.fname, + "%s" % self.meta.get('summary', ''), + "%s" % self.content, + ]) -class Comments(object): - def __init__(self): - self.files = glob.glob(os.path.join( - shared.config.get('source', 'commentsdir'), - "*.md" - )) - self.bytarget = {} + if self.photo: + corpus = corpus + "\n".join(self.meta.get('tags', [])) + return corpus - def __getitem__(self, key): - return self.bytarget.get(key, BaseIter()) + @property + def lang(self): + # default is English, this will only be changed if the try + # succeeds and actually detects a language + lang = 'en' + try: + lang = langdetect.detect("\n".join([ + self.fname, + self.meta.get('title', ''), + self.content + ])) + except: + pass + return lang - - def populate(self): - for fpath in self.files: - item = Comment(fpath) - t = item.targeturl - if not self.bytarget.get(t): - self.bytarget[t] = BaseIter() - self.bytarget[t].append(item.pubtime, item) - - -class Images(BaseIter): - def __init__(self, extensions=['jpg', 'gif', 'png']): - super(Images, self).__init__() - logging.info( - "initiating images with extensions: %s", - extensions - ) - self.files = [] - self.data = {} - # if anyone knows how to do this in a more pythonic way, please tell me - paths = [ - shared.config.get('source', 'filesdir'), - shared.config.get('source', 'photosdir') - ] - for p in 
paths: - for ext in extensions: - self.files += glob.glob(os.path.join(p, "*.%s" % ext)) - - - def populate(self): - with ExifTool() as e: - _meta = e.run(*self.files) - # parsing the returned meta into a dict of [filename]={meta} - for e in _meta: - if 'FileName' not in e: - logging.error("missing 'FileName' in element %s", e) - continue - fname = os.path.basename(e['FileName']) - del(e['FileName']) - # duplicate files are going to be a problem, so don't send it - # away with a simple error log entry - if fname in self.data: - raise ValueError('filename collision: %s', fname) - # convert dates - for k, v in e.items(): - e[k] = self.exifdate(v) - - self.data[fname] = WebImage(fname, e) - - - def exifdate(self, value): - """ converts and EXIF date string to ISO 8601 format - - :param value: EXIF date (2016:05:01 00:08:24) - :type arg1: str - :return: ISO 8601 string with UTC timezone 2016-05-01T00:08:24+0000 - :rtype: str - """ - if not isinstance(value, str): - return value - match = shared.EXIFREXEG.match(value) - if not match: - return value - return "%s-%s-%sT%s+0000" % ( - match.group('year'), - match.group('month'), - match.group('day'), - match.group('time') - ) - - -class WebImage(object): - def __init__(self, fname, meta): - logging.info( - "parsing image: %s", + def _find_image(self, fname): + pattern = os.path.join( + shared.config.get('dirs', 'files'), + '*', fname ) - self.meta = meta - self.fpath = os.path.abspath(meta.get('SourceFile', fname)) - self.fname, self.ext = os.path.splitext(fname) - self.alttext = '' - self.sizes = [] - self.fallbacksize = int(shared.config.get('common','fallbackimg', fallback='720')) - self.cl = '' - self.singleimage = False + logging.debug('trying to locate image %s in %s', fname, pattern) + maybe = glob.glob(pattern) - for size in shared.config.options('downsize'): - sizeext = shared.config.get('downsize', size) - fname = "%s_%s%s" % (self.fname, sizeext, self.ext) - self.sizes.append(( - int(size), - { - 'fpath': os.path.join( - shared.config.get('target', 'filesdir'), - fname - ), - 'url': "/%s/%s" % ( - #'url': "%s/%s/%s" % ( - #shared.config.get('site', 'url'), - shared.config.get('source', 'files'), - fname - ), - 'crop': shared.config.getboolean('crop', size, fallback=False), - } - )) + if not maybe: + return None - self.sizes = sorted(self.sizes, reverse=False) + if fname not in self._images: + im = WebImage(maybe.pop()) + self._images.append(fname,im) + return self._images[fname] - self.target = False - if self.is_downsizeable: - self.fallback = [e for e in self.sizes if e[0] == self.fallbacksize][0][1]['url'] - self.small = [e for e in self.sizes if e[1]['crop'] == False][0][1]['url'] - self.target = self.sizes[-1][1]['url'] - else: - self.small = self.fallback = "/%s/%s" % ( - #self.small = self.fallback = "%s/%s/%s" % ( - #shared.config.get('site', 'url'), - shared.config.get('source', 'files'), - "%s%s" % (self.fname, self.ext) - ) + @property + def inline_images(self): + return shared.REGEX['mdimg'].findall(self.content) - def _setupvars(self): - if self.is_downsizeable: - if self.singleimage and not self.cl: - self.cl = '.u-photo' - elif self.singleimage: - self.cl = '.u-photo %s' % self.cl - else: - if not self.cl: - self.cl = '.aligncenter' + @property + def url(self): + return "%s/%s" % (shared.config.get('site', 'url'), self.fname) + + @property + def body(self): + body = "%s" % (self.content) + # get inline images, downsize them and convert them to figures + for shortcode, alt, fname, title, css in self.inline_images: + fname 
= os.path.basename(fname) + im = self._find_image(fname) + if not im: + continue + + im.alt = alt + im.title = title + im.cssclass = css + body = body.replace(shortcode, str(im)) + + # TODO if multiple meta images, inline all except the first + # which will be added at the HTML stage or as enclosure to the feed + return body + + @property + def html(self): + html = "%s" % (self.body) + + # add photo + if self.photo: + html = "%s\n%s" % (str(self.photo), html) + + return shared.Pandoc().convert(html) + + @property + def title(self): + maybe = self.meta.get('title', False) + if maybe: + return maybe + if self.is_reply: + return "RE: %s" % self.is_reply + return self.published.format(shared.ARROWFORMAT['display']) + + @property + def summary(self): + s = self.meta.get('summary', '') + if not s: + return s + return shared.Pandoc().convert(s) + + @property + def shortslug(self): + return shared.baseN(self.pubtime) @property def tmplvars(self): - self._setupvars() - if hasattr(self, '_tmplvars'): - return self._tmplvars - - self._tmplvars = { - 'alttext': self.alttext, - 'fallback': self.fallback, - 'small': self.small, - 'title': "%s%s" % (self.fname, self.ext), - 'target': self.target, - 'cl': " ".join([s[1:] for s in self.cl.split()]), - 'orientation': self.orientation - } + # very simple caching because we might use this 4 times: + # post HTML, category, front posts and atom feed + if not hasattr(self, '_tmplvars'): + self._tmplvars = { + 'title': self.title, + 'pubtime': self.published.format(shared.ARROWFORMAT['iso']), + 'pubdate': self.published.format(shared.ARROWFORMAT['display']), + 'category': self.category, + 'html': self.html, + 'lang': self.lang, + 'slug': self.fname, + 'shortslug': self.shortslug, + 'licence': self.licence, + #'sourceurl': self.sourceurl, + 'is_reply': self.is_reply, + 'age': int(self.published.format('YYYY')) - int(arrow.utcnow().format('YYYY')), + 'summary': self.summary + } return self._tmplvars - def __str__(self): - self._setupvars() + async def render(self): + logging.info('rendering %s' % (self.fname)) + o = self.htmlfile + if self.is_uptodate: + logging.debug('%s is up to date' % (o)) + return + + tmplfile = "%s.html" % (__class__.__name__) + r = shared.j2.get_template(tmplfile).render({ + 'post': self.tmplvars, + 'site': shared.site, + }) + + d = os.path.dirname(o) + if not os.path.isdir(d): + logging.debug('creating directory %s' % (d)) + os.makedirs(d) + with open(o, 'wt') as out: + logging.debug('writing file %s' % (o)) + out.write(r) + os.utime(o, (self.mtime, self.mtime)) + + def __repr__(self): + return "%s/%s" % (self.category, self.fname) + + +class WebImage(object): + def __init__(self, fpath): + logging.info("parsing image: %s", fpath) + self.fpath = fpath + self.mtime = os.path.getmtime(self.fpath) + bname = os.path.basename(fpath) + self.fname, self.fext = os.path.splitext(bname) + self.title = '' + self.alt = bname + self.target = '' + self.cssclass = '' + + @property + def fm_content(self): + return self.meta.get('Description', '') + + @property + def fm_meta(self): + return { + 'published': self.meta.get('ReleaseDate', + self.meta.get('ModifyDate') + ), + 'title': self.meta.get('Headline', self.fname), + 'tags': list(set(self.meta.get('Subject', []))), + } + + @property + def href(self): + if len(self.target): + return self.target + + if not self.is_downsizeable: + return False + + return self.sizes[-1][1]['url'] + + @property + def src(self): + # is the image is too small to downsize, it will be copied over + # so the link needs to point 
at
+        src = "/%s/%s" % (
+            shared.config.get('common', 'files'),
+            "%s%s" % (self.fname, self.fext)
+        )
 
         if self.is_downsizeable:
-            #if self.singleimage and not self.cl:
-                #self.cl = '.u-photo'
-            #elif self.singleimage:
-                #self.cl = '.u-photo %s' % self.cl
-
-            return '[![%s](%s "%s%s"){.adaptimg}](%s){.adaptive %s}' % (
-                self.alttext,
-                self.fallback,
-                self.fname,
-                self.ext,
-                self.target,
-                self.cl
-            )
-        else:
-            #if not self.cl:
-                #self.cl = '.aligncenter'
-            return '![%s](%s "%s%s"){%s}' % (
-                self.alttext,
-                self.fallback,
-                self.fname,
-                self.ext,
-                self.cl
-            )
+            try:
+                src = [e for e in self.sizes if e[0] == shared.config.getint('photo', 'default')][0][1]['url']
+            except:
+                pass
+        return src
 
+    @property
+    def meta(self):
+        if not hasattr(self, '_exif'):
+            # reading EXIF is expensive enough even with a static generator
+            # to consider caching it, so I'll do that here
+            cpath = os.path.join(
+                shared.config.get('var', 'cache'),
+                "%s.exif.json" % self.fname
+            )
+
+            if os.path.exists(cpath):
+                cmtime = os.path.getmtime(cpath)
+                if cmtime >= self.mtime:
+                    with open(cpath, 'rt') as f:
+                        self._exif = json.loads(f.read())
+                        return self._exif
+
+            self._exif = shared.ExifTool(self.fpath).read()
+            if not os.path.isdir(shared.config.get('var', 'cache')):
+                os.makedirs(shared.config.get('var', 'cache'))
+            with open(cpath, 'wt') as f:
+                f.write(json.dumps(self._exif))
+        return self._exif
+
+    @property
+    def is_photo(self):
+        # missing regex from config
+        if 'photo' not in shared.REGEX:
+            logging.debug('%s: photo regex missing from config', self.fpath)
+            return False
+
+        cpr = self.meta.get('Copyright', '')
+        art = self.meta.get('Artist', '')
+
+        # both Artist and Copyright missing from EXIF
+        if not cpr and not art:
+            logging.debug('%s: Artist and Copyright missing from EXIF', self.fpath)
+            return False
+
+        # we have regex, Artist and Copyright, try matching them
+        pattern = re.compile(shared.config.get('photo', 'regex'))
+        if pattern.search(cpr) or pattern.search(art):
+            return True
+
+        logging.debug('%s: patterns did not match', self.fpath)
+        return False
 
     @property
     def exif(self):
-        if not self.is_photo:
-            return {}
-
-        if hasattr(self, '_exif'):
-            return self._exif
-
         exif = {}
+        if not self.is_photo:
+            return exif
+
         mapping = {
-            'camera': [
-                'Model'
-            ],
-            'aperture': [
-                'FNumber',
-                'Aperture'
-            ],
-            'shutter_speed': [
-                'ExposureTime'
-            ],
-            'focallength35mm': [
-                'FocalLengthIn35mmFormat',
-            ],
-            'focallength': [
-                'FocalLength',
-            ],
-            'iso': [
-                'ISO'
-            ],
-            'lens': [
-                'LensID',
-            ],
-            'date': [
-                'CreateDate',
-                'DateTimeOriginal',
-            ],
-            'geo_latitude': [
-                'GPSLatitude'
-            ],
-            'geo_longitude': [
-                'GPSLongitude'
-            ],
+            'camera': ['Model'],
+            'aperture': ['FNumber','Aperture'],
+            'shutter_speed': ['ExposureTime'],
+            'focallength': ['FocalLengthIn35mmFormat', 'FocalLength'],
+            'iso': ['ISO'],
+            'lens': ['LensID', 'LensSpec', 'Lens',],
+            #'date': ['CreateDate','DateTimeOriginal'],
+            'geo_latitude': ['GPSLatitude'],
+            'geo_longitude': ['GPSLongitude'],
         }
 
         for ekey, candidates in mapping.items():
             for candidate in candidates:
                 maybe = self.meta.get(candidate, None)
-                if maybe:
-                    if 'geo_' in ekey:
-                        exif[ekey] = round(float(maybe), 5)
-                    else:
-                        exif[ekey] = maybe
-                    break
-
-        self._exif = exif
-        return self._exif
+                if not maybe:
+                    continue
+                elif 'geo_' in ekey:
+                    exif[ekey] = round(float(maybe), 5)
+                else:
+                    exif[ekey] = maybe
+                break
+        return exif
 
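+    # note on the mapping above: candidates are tried in listed order, so
+    # e.g. FocalLengthIn35mmFormat wins over FocalLength when both exist;
+    # GPS values are rounded to 5 decimal places, roughly 1 metre precision
+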
     @property
-    def orientation(self):
-        width = int(self.meta.get('ImageWidth', 0))
-        height = int(self.meta.get('ImageHeight', 0))
+    def sizes(self):
+        sizes = []
+        _max = max(
+            int(self.meta.get('ImageWidth')),
+            int(self.meta.get('ImageHeight'))
+        )
 
-        if width >= height:
-            return 'horizontal'
-        return 'vertical'
+        for size in shared.config.options('downsize'):
+            if _max < int(size):
+                continue
 
-    @property
-    def rssenclosure(self):
-        """ Returns the largest available image for RSS to add as attachment """
-        if hasattr(self, '_rssenclosure'):
-            return self._rssenclosure
+            name = '%s_%s%s' % (
+                self.fname,
+                shared.config.get('downsize', size),
+                self.fext
+            )
 
-        target = self.sizes[-1][1]
-        self._rssenclosure = {
-            'mime': magic.Magic(mime=True).from_file(target['fpath']),
-            'url': target['url'],
-            'size': os.path.getsize(target['fpath']),
-            'fname': self.fname
-        }
-        return self._rssenclosure
+            fpath = os.path.join(
+                shared.config.get('common', 'build'),
+                shared.config.get('common', 'files'),
+                name
+            )
+            exists = os.path.isfile(fpath)
+            # in case there is a downsized image compare against the main file's
+            # mtime and invalidate the existing if it's older
+            if exists:
+                mtime = os.path.getmtime(fpath)
+                if self.mtime > mtime:
+                    exists = False
 
-    @property
-    def is_photo(self):
-        if hasattr(self, '_is_photo'):
-            return self._is_photo
-
-        self._is_photo = False
-        #if not pattern or not isinstance(pattern, str):
-        #    return False
-        pattern = re.compile(shared.config.get('photo', 'regex'))
-
-        cpr = self.meta.get('Copyright', '')
-        art = self.meta.get('Artist', '')
-        if not cpr and not art:
-            return False
-
-        if cpr and art:
-            if pattern.search(cpr) or pattern.search(art):
-                self._is_photo = True
-
-        return self._is_photo
+            sizes.append((
+                int(size),
+                {
+                    'fpath': fpath,
+                    'exists': exists,
+                    'url': "%s/%s/%s" % (
+                        shared.config.get('site', 'url'),
+                        shared.config.get('common', 'files'),
+                        name
+                    ),
+                    'crop': shared.config.getboolean(
+                        'crop',
+                        size,
+                        fallback=False
+                    )
+                }
+            ))
+        return sorted(sizes, reverse=False)
 
     @property
     def is_downsizeable(self):
-        if hasattr(self, '_is_downsizeable'):
-            return self._is_downsizeable
-
-        self._is_downsizeable = False
-        """ Check if the image is large enough and jpeg or png in order to
-        downsize it """
-        fb = self.sizes[-1][0]
+        """ Check if the image is large enough to downsize it """
         ftype = self.meta.get('FileType', None)
         if not ftype:
-            return self._is_downsizeable
-        if ftype.lower() == 'jpeg' or ftype.lower() == 'png':
-            width = int(self.meta.get('ImageWidth', 0))
-            height = int(self.meta.get('ImageHeight', 0))
-            if width > fb or height > fb:
-                self._is_downsizeable = True
-
-        return self._is_downsizeable
+            return False
+        elif ftype.lower() != 'jpeg' and ftype.lower() != 'png':
+            return False
 
-    def _copy(self):
-        target = os.path.join(
-            shared.config.get('target', 'filesdir'),
-            "%s%s" % (self.fname, self.ext)
+        _max = max(
+            int(self.meta.get('ImageWidth')),
+            int(self.meta.get('ImageHeight'))
         )
-        if not os.path.isfile(target):
-            logging.debug("can't downsize %s, copying instead" % self.fname)
-            shutil.copy(self.fpath, target)
+        _min = shared.config.getint('photo','default')
+        if _max > _min:
+            return True
+        return False
 
-    def _watermark(self, img):
+    def _maybe_watermark(self, img):
         """ Composite image by adding watermark file over it """
-        wmarkfile = os.path.join(
-            shared.config.get('common', 'basedir'),
-            shared.config.get('common', 'watermark')
-        )
-        if not os.path.isfile(wmarkfile):
+
+        if not self.is_photo:
+            logging.debug("not watermarking: not a photo")
             return img
 
+        wmarkfile = shared.config.get('photo', 'watermark')
+        if not os.path.isfile(wmarkfile):
+            logging.debug("not watermarking: 
watermark not found") + return img + + logging.debug("%s is a photo, applying watermarking", self.fpath) with wand.image.Image(filename=wmarkfile) as wmark: if img.width > img.height: - w = img.width * 0.16 + w = img.width * 0.2 h = wmark.height * (w / wmark.width) x = img.width - w - (img.width * 0.01) y = img.height - h - (img.height * 0.01) @@ -795,13 +774,26 @@ class WebImage(object): if img.width <= img.height: wmark.rotate(-90) img.composite(image=wmark, left=x, top=y) + return img + def _copy(self): + fname = "%s%s" % (self.fname, self.fext) + logging.info("copying %s to build dir", fname) + fpath = os.path.join( + shared.config.get('common', 'build'), + shared.config.get('common', 'files'), + fname + ) + if os.path.isfile(fpath): + mtime = os.path.getmtime(fpath) + if self.mtime <= mtime: + return + shutil.copy(self.fpath, fpath) - def _intermediate_dimensions(self, size, width, height, crop = False): + def _intermediate_dimension(self, size, width, height, crop=False): + """ Calculate intermediate resize dimension and return a tuple of width, height """ size = int(size) - w = width - h = height if (width > height and not crop) \ or (width < height and crop): w = size @@ -811,14 +803,12 @@ class WebImage(object): w = int(float(size / height) * width) return (w, h) - - def _intermediate(self, img, size, meta, existing = []): + def _intermediate(self, img, size, target, crop=False): if img.width <= size and img.height <= size: return False - crop = meta.get('crop', False) with img.clone() as thumb: - width, height = self._intermediate_dimensions( + width, height = self._intermediate_dimension( size, img.width, img.height, @@ -840,1318 +830,200 @@ class WebImage(object): thumb.format = 'pjpeg' # this is to make sure pjpeg happens - with open(meta['fpath'], 'wb') as f: + with open(target, 'wb') as f: + logging.info("writing %s", target) thumb.save(file=f) - return True + @property + def needs_downsize(self): + needed = False + for (size, downsized) in self.sizes: + if downsized.get('exists', False): + logging.debug("size %d exists: %s", size, downsized.get('fpath')) + continue + logging.debug("size %d missing: %s", size, downsized.get('fpath')) + needed = True + return needed - - async def downsize(self, existing = []): + async def downsize(self): if not self.is_downsizeable: - self._copy() + return self._copy() + + if not self.needs_downsize and not shared.config.getboolean('params', 'regenerate'): return - logging.info("checking downsizing for %s", self.fname) - needed = shared.config.getboolean('params', 'regenerate', fallback=False) + build_files = os.path.join( + shared.config.get('common', 'build'), + shared.config.get('common', 'files'), + ) - if not needed: - for (size, meta) in self.sizes: - if meta['fpath'] not in existing: - needed = True - - if not needed: - logging.debug("downsizing not needed for %s", self.fname) - return + if not os.path.isdir(build_files): + os.makedirs(build_files) + logging.info("downsizing %s%s", self.fname, self.fext) with wand.image.Image(filename=self.fpath) as img: img.auto_orient() - - if self.is_photo: - logging.info("%s is a photo", self.fpath) - img = self._watermark(img) - - for (size, meta) in self.sizes: - self._intermediate(img, size, meta, existing) - - -class Taxonomy(BaseIter): - def __init__(self, name = None, taxonomy = None, slug = None): - super(Taxonomy, self).__init__() - self.name = name - if name and not slug: - self.slug = slugify(name, only_ascii=True, lower=True) - else: - self.slug = slug - self.taxonomy = taxonomy - - - 
#@property - #def pages(self): - #if hasattr(self, '_pages'): - #return self._pages - #self._pages = math.ceil(len(self.data) / shared.config.getint('common', 'pagination')) - #return self._pages - - def __repr__(self): - return "taxonomy %s with %d items" % (self.taxonomy, len(self.data)) - - - @property - def basep(self): - p = shared.config.get('target', 'builddir') - if self.taxonomy: - p = os.path.join(p, self.taxonomy) - if not os.path.isdir(p): - os.mkdir(p) - return p - - - @property - def myp(self): - p = self.basep - if self.slug: - p = os.path.join(p,self.slug) - if not os.path.isdir(p): - os.mkdir(p) - return p - - - @property - def feedp(self): - p = os.path.join(self.myp, 'feed') - if not os.path.isdir(p): - os.mkdir(p) - return p - - @property - def pagep(self): - p = os.path.join(self.myp, 'page') - if not os.path.isdir(p): - os.mkdir(p) - return p - - - @property - def baseurl(self): - if self.taxonomy and self.slug: - return "/%s/%s/" % (self.taxonomy, self.slug) - else: - return '/' - - - @property - def mtime(self): - if hasattr(self, '_mtime'): - return self._mtime - self._mtime = int(list(sorted(self.data.keys(), reverse=True))[0]) - return self._mtime - - - def __mkdirs(self): - check = [self.basep, self.myp, self.feedp] - for p in check: - if not os.path.isdir(p): - logging.debug("creating dir %s", p) - os.mkdir(p) - - def tpath(self, page): - if page == 1: - p = "%s" % (self.myp) - else: - p = os.path.join(self.pagep, "%d" % page) - - if not os.path.isdir(p): - logging.debug("creating dir %s", p) - os.mkdir(p) - - return os.path.join(p, "index.html") - - @property - def is_singlepage(self): - spcats = shared.config.get('common', 'onepagecategories').split(',') - if self.name in spcats and 'category' == self.taxonomy: - return True - return False - - def posttmpls(self, order='time', start=0, end=None): - end = end or len(self.data) - if 'all' == order: - return [ - i.tmplvars - for k, i in list(sorted( - self.data.items(), - key=lambda value: value[1].title.lower() - )) - ] - - return [ - self.data[k].tmplvars - for k in list(sorted( - self.data.keys(), - reverse=True - ))[start:end] - ] - - - async def render(self, renderer): - #if not self.slug or self.slug is 'None': - #return - - self.__mkdirs() - page = 1 - testpath = self.tpath(page) - if not shared.config.getboolean('params', 'force') and os.path.isfile(testpath): - ttime = int(os.path.getmtime(testpath)) - mtime = self.mtime - if ttime == mtime: - logging.info('taxonomy index for "%s" exists and up-to-date (lastmod: %d)', self.slug, ttime) - return - else: - logging.info('taxonomy update needed: %s timestamp is %d, last post timestamp is %d (%s)', - testpath, - ttime, - mtime, - self.data[mtime].fname + img = self._maybe_watermark(img) + for (size, downsized) in self.sizes: + self._intermediate( + img, + size, + downsized['fpath'], + downsized['crop'] ) - if self.is_singlepage: - pagination = len(self.data) - else: - pagination = shared.config.getint('common', 'pagination') - pages = math.ceil(len(self.data) / pagination) - - while page <= pages: - self.render_page(renderer, page, pagination, pages) - page = page+1 - - self.render_feeds(renderer) - self.ping_websub - - - def render_feeds(self, renderer): - pagination = shared.config.getint('common', 'pagination') - start = 0 - end = int(start + pagination) - posttmpls = self.posttmpls('time', start, end) - tmplvars = { - 'taxonomy': { - 'url': self.baseurl, - 'name': self.name, - 'slug': self.slug, - 'taxonomy': self.taxonomy, - 'lastmod': 
arrow.get(self.mtime).datetime - }, - 'site': renderer.sitevars, - 'posts': posttmpls, - } - - target = os.path.join(self.feedp, 'index.atom') - logging.info("rendering Atom feed to %s", target) - r = renderer.j2.get_template('atom.html').render(tmplvars) - with open(target, "wt") as html: - html.write(r) - os.utime(target, (self.mtime, self.mtime)) - - - def render_page(self, renderer, page, pagination, pages): - if self.is_singlepage: - posttmpls = self.posttmpls('all') - else: - start = int((page-1) * pagination) - end = int(start + pagination) - posttmpls = self.posttmpls('time', start, end) - - target = self.tpath(page) - tdir = os.path.dirname(target) - if not os.path.isdir(tdir): - logging.debug("creating dir %s", tdir) - os.mkdir(tdir) - - logging.info("rendering taxonomy page %d to %s", page, target) - tmplvars = { - 'taxonomy': { - 'url': self.baseurl, - 'name': self.name, - 'slug': self.slug, - 'taxonomy': self.taxonomy, - 'paged': page, - 'total': pages, - 'perpage': pagination, - 'lastmod': arrow.get(self.mtime).datetime - }, - 'site': renderer.sitevars, - 'posts': posttmpls, - } - - r = renderer.j2.get_template('archive.html').render(tmplvars) - with open(target, "wt") as html: - html.write(r) - os.utime(target, (self.mtime, self.mtime)) - - - def ping_websub(self): - if not self.taxonomy or self.taxonomy == 'category': - t = shared.config.get('site', 'websuburl') - data = { - 'hub.mode': 'publish', - 'hub.url': "%s%s" % ( - shared.config.get('site', 'url'), self.baseurl - ) - } - logging.info("pinging %s with data %s", t, data) - requests.post(t, data=data) - - - #def renderpage(self, renderer, page): - #pagination = int(shared.config.get('common', 'pagination')) - #start = int((page-1) * pagination) - #end = int(start + pagination) - - #posttmpls = [self.data[k].tmplvars for k in list(sorted( - #self.data.keys(), reverse=True))[start:end]] - - #target = self.tpath(page) - #logging.info("rendering taxonomy page %d to %s", page, target) - #tmplvars = { - #'taxonomy': { - #'url': self.baseurl, - #'name': self.name, - #'slug': self.slug, - #'taxonomy': self.taxonomy, - #'paged': page, - #'total': self.pages, - #'perpage': pagination, - #'lastmod': arrow.get(self.mtime).datetime - #}, - #'site': renderer.sitevars, - #'posts': posttmpls, - #} - - #r = renderer.j2.get_template('archive.html').render(tmplvars) - #with open(target, "wt") as html: - #html.write(r) - #os.utime(target, (self.mtime, self.mtime)) - - #if 1 == page: - ##target = os.path.join(self.feedp, 'index.rss') - ##logging.info("rendering RSS feed to %s", target) - ##r = renderer.j2.get_template('rss.html').render(tmplvars) - ##with open(target, "wt") as html: - ##html.write(r) - ##os.utime(target, (self.mtime, self.mtime)) - - #target = os.path.join(self.feedp, 'index.atom') - #logging.info("rendering Atom feed to %s", target) - #r = renderer.j2.get_template('atom.html').render(tmplvars) - #with open(target, "wt") as html: - #html.write(r) - #os.utime(target, (self.mtime, self.mtime)) - - ## --- - ## this is a joke - ## see http://indieweb.org/YAMLFeed - ## don't do YAMLFeeds. 
- #if 1 == page: - #fm = frontmatter.loads('') - #fm.metadata = { - #'site': { - #'author': renderer.sitevars['author'], - #'url': renderer.sitevars['url'], - #'title': renderer.sitevars['title'], - #}, - #'items': [], - #} - - #for p in posttmpls: - #fm.metadata['items'].append({ - #'title': p['title'], - #'url': "%s/%s/" % ( renderer.sitevars['url'], p['slug']), - #'content': p['content'], - #'summary': p['summary'], - #'published': p['published'], - #'updated': p['updated'], - #}) - - #target = os.path.join(self.feedp, 'index.yml') - #logging.info("rendering YAML feed to %s", target) - #with open(target, "wt") as html: - #html.write(frontmatter.dumps(fm)) - #os.utime(target, (self.mtime, self.mtime)) - ## --- - - #if 1 == page: - #if not self.taxonomy or self.taxonomy == 'category': - #t = shared.config.get('site', 'websuburl') - #data = { - #'hub.mode': 'publish', - #'hub.url': "%s%s" % ( - #shared.config.get('site', 'url'), self.baseurl - #) - #} - #logging.info("pinging %s with data %s", t, data) - #requests.post(t, data=data) - - -class Content(BaseIter): - def __init__(self, images, comments, extensions=['md']): - super(Content, self).__init__() - self.images = images - self.comments = comments - basepath = shared.config.get('source', 'contentdir') - self.files = [] - for ext in extensions: - self.files += glob.glob(os.path.join(basepath, "*", "*.%s" % ext)) - self.tags = {} - self.categories = {} - self.front = Taxonomy() - self.shortslugmap = {} - - - def populate(self): - now = arrow.utcnow().timestamp - for fpath in self.files: - item = Singular(fpath, self.images, self.comments) - self.append(item.pubtime, item) - #self.shortslugmap[item.shortslug] = item.fname - - if item.isfuture: - logging.warning("skipping future post %s", item.fname) - continue - - if item.isonfront: - self.front.append(item.pubtime, item) - - if item.iscategorised: - if item.category not in self.categories: - self.categories[item.category] = Taxonomy(item.category, 'category') - self.categories[item.category].append(item.pubtime, item) - - for tag in item.tags: - tslug = slugify(tag, only_ascii=True, lower=True) - if tslug not in self.tags: - self.tags[tslug] = Taxonomy(tag, 'tag', tslug) - self.tags[tslug].append(item.pubtime, item) - #self.symlinktag(tslug, item.path) - - #def symlinktag(self, tslug, fpath): - #fdir, fname = os.path.split(fpath) - #tagpath = os.path.join(shared.config.get('source', 'tagsdir'), tslug) - #if not os.path.isdir(tagpath): - #os.mkdir(tagpath) - #sympath = os.path.relpath(fdir, tagpath) - #dst = os.path.join(tagpath, fname) - #src = os.path.join(sympath, fname) - #if not os.path.islink(dst): - #os.symlink(src, dst) - - def sitemap(self): - target = os.path.join( - shared.config.get('target', 'builddir'), - 'sitemap.txt' - ) - urls = [] - for item in self.data.values(): - urls.append( "%s/%s/" % ( - shared.config.get('site', 'url'), - item.fname - )) - - with open(target, "wt") as f: - logging.info("writing sitemap to %s" % (target)) - f.write("\n".join(urls)) - - - def magicphp(self, renderer): - redirects = [] - gones = [] - rfile = os.path.join( - shared.config.get('common', 'basedir'), - shared.config.get('common', 'redirects') - ) - if os.path.isfile(rfile): - with open(rfile, newline='') as csvfile: - r = csv.reader(csvfile, delimiter=' ') - for row in r: - redirects.append((row[0], row[1])) - for item in self.data.values(): - redirects.append((item.shortslug, item.fname)) - - rfile = os.path.join( - shared.config.get('common', 'basedir'), - shared.config.get('common', 
'gone') - ) - if os.path.isfile(rfile): - with open(rfile, newline='') as csvfile: - r = csv.reader(csvfile, delimiter=' ') - for row in r: - gones.append(row[0]) - - tmplvars = { - 'site': renderer.sitevars, - 'redirects': redirects, - 'gones': gones - } - - r = renderer.j2.get_template("magic.php").render(tmplvars) - target = os.path.abspath(os.path.join( - shared.config.get('target', 'builddir'), - 'magic.php' - )) - - with open(target, "w") as html: - logging.debug('writing %s', target) - html.write(r) - html.close() - - -class Singular(BaseRenderable): - def __init__(self, path, images = None, comments = None): - logging.debug("initiating singular object from %s", path) - self.path = path - self.images = images or Images() - self.allcomments = comments or Comments() - self.category = splitpath(path)[-2] - self.mtime = int(os.path.getmtime(self.path)) - self.fname, self.ext = os.path.splitext(os.path.basename(self.path)) - self.meta = {} - self.content = '' - self.photo = self.images.data.get("%s.jpg" % self.fname, None) - if self.photo: - self.photo.singleimage = True - self.__parse() - - - def __repr__(self): - return "%s (lastmod: %s)" % (self.fname, self.published) - - - def __parse(self): - with open(self.path, mode='rt') as f: - self.meta, self.content = frontmatter.parse(f.read()) - #self.__filter_syndication() - self.__filter_favs() - self.__filter_images() - #if self.isphoto: - #self.content = "%s\n\n%s" % ( - #self.photo, - #self.content, - #) - #if shared.config.getboolean('params', 'nooffline'): - #return - #trigger = self.offlinecopies - - #def __filter_syndication(self): - #syndications = self.meta.get('syndicate', None) - #if not syndications: - #return - #s = "\n\n".join(['' % s for s in syndications]) - #self.content = "%s\n\n%s" % ( - #s, - #self.content - #) - - def __filter_favs(self): - url = self.meta.get('favorite-of', - self.meta.get('like-of', - self.meta.get('bookmark-of', - False - ) - ) - ) - - if not url: - return - - img = self.meta.get('image', False) - imgs = self.meta.get('images', []) - if img: - imgs.append(img) - - if not imgs or not len(imgs): - return - - c = '' - for i in imgs: - c = '%s\n[![%s](/%s/%s)](%s){.favurl}' % ( - c, - self.title, - shared.config.get('source', 'files'), - i, - url - ) - - if self.isbookmark: - c = "%s\n\n%s" % (c, self.content) - - self.content = c - - - def __filter_images(self): - linkto = False - isrepost = None - - if len(self.reactions.keys()): - isrepost = list(self.reactions.keys())[0] - if isrepost and \ - len(self.reactions[isrepost]) == 1: - linkto = self.reactions[isrepost][0] - - m = shared.MDIMGREGEX.findall(self.content) - if not m: - logging.debug("no images found") - return - - for shortcode, alt, fname, title, cl in m: - image = self.images.data.get(fname, None) - if not image: - logging.debug("%s not found in images", fname) - continue - - if cl: - image.cl = cl - - logging.debug( - "replacing %s in content with %s", - shortcode, - "%s" % image - ) - self.content = self.content.replace( - shortcode, - "%s" % image - ) - - - @property - def comments(self): - if hasattr(self, '_comments'): - return self._comments - - # the comments could point to both the "real" url and the shortslug - # so I need to get both - c = {} - for by in [self.fname, self.shortslug]: - c = {**c, **self.allcomments[by].data} - #self._comments = [c[k].tmplvars for k in list(sorted(c.keys(), reverse=True))] - self._comments = [c[k] for k in list(sorted(c.keys(), reverse=False))] - return self._comments - - - @property - def 
replies(self): - if hasattr(self, '_replies'): - return self._replies - self._replies = [c.tmplvars for c in self.comments if not len(c.reacji)] - return self._replies - - - @property - def reacjis(self): - if hasattr(self, '_reacjis'): - return self._reacjis - reacjis = {} - for c in self.comments: - rj = c.reacji - - if not len(rj): - continue - - if not reacjis.get(rj, False): - reacjis[rj] = [] - - reacjis[rj].append(c.tmplvars) - - self._reacjis = reacjis - return self._reacjis - - - @property - def reactions(self): - if hasattr(self, '_reactions'): - return self._reactions - # getting rid of '-' to avoid css trouble and similar - convert = { - 'bookmark-of': 'bookmark', - 'repost-of': 'repost', - 'in-reply-to': 'reply', - 'favorite-of': 'fav', - 'like-of': 'like', - } - reactions = {} - - for k, v in convert.items(): - x = self.meta.get(k, None) - if not x: - continue - if isinstance(x, str): - x = [x] - reactions[v] = x - - self._reactions = reactions - return self._reactions - - @property - def author(self): - return dict(shared.config.items('author')) - - @property - def license(self): - if hasattr(self, '_licence'): - return self._licence - - - if 'article' == self.category: - l = { - 'url': 'https://creativecommons.org/licenses/by/4.0/', - 'text': 'CC BY 4.0', - } - elif 'journal' == self.category: - l = { - 'url': 'https://creativecommons.org/licenses/by-nc/4.0/', - 'text': 'CC BY-NC 4.0', - } - else: - l = { - 'url': 'https://creativecommons.org/licenses/by-nc-nd/4.0/', - 'text': 'CC BY-NC-ND 4.0', - } - - self._licence = l - return self._licence - - @property - def syndicate(self): - return self.meta.get('syndicate', []) - - @property - def urls(self): - if hasattr(self, '_urls'): - return self._urls - - #urls = shared.URLREGEX.findall(self.content) - #urls = [*urls, *self.syndicate] - urls = list(self.syndicate) - - for reactionurls in self.reactions.values(): - urls = [*urls, *reactionurls] - - #r = [] - #logging.debug('searching for urls for %s', self.fname) - #for link in urls: - #purl = urllib.parse.urlparse(link) - #if purl.netloc in shared.config.get('site', 'domains'): - #logging.debug('excluding url %s - %s matches %s', link, purl.netloc, shared.config.get('site', 'domains')) - #continue - #if link in r: - #continue - #r.append(link) - - self._urls = urls - logging.debug('urls for %s: %s', self.fname, urls) - return self._urls - - - @property - def lang(self): - if hasattr(self, '_lang'): - return self._lang - - lang = 'en' - try: - lang = langdetect.detect("\n".join([ - self.title, - self.content - ])) - except: - pass - self._lang = lang - return self._lang - - - @property - def tags(self): - return list(self.meta.get('tags', [])) - - - @property - def published(self): - if hasattr(self, '_published'): - return self._published - self._published = arrow.get( - self.meta.get('published', self.mtime) - ) - return self._published - - - @property - def updated(self): - if hasattr(self, '_updated'): - return self._updated - self._updated = arrow.get( - self.meta.get('updated', - self.meta.get('published', self.mtime) - ) - ) - return self._updated - - - @property - def pubtime(self): - return int(self.published.timestamp) - - - @property - def isphoto(self): - if not self.photo: - return False - return self.photo.is_photo - - - @property - def isbookmark(self): - return self.meta.get('bookmark-of', False) - - - @property - def isreply(self): - return self.meta.get('in-reply-to', False) - - @property - def isfuture(self): - now = arrow.utcnow().timestamp - if self.pubtime 
> now: - return True - return False - - # TODO - #@property - #def isrvsp(self): - # r'([^<]+)' - - - @property - def isfav(self): - r = False - for maybe in ['like-of', 'favorite-of']: - maybe = self.meta.get(maybe, False) - if maybe: - r = maybe - break - return r - - - @property - def ispage(self): - if not self.meta: - return True - return False - - - @property - def isonfront(self): - if self.ispage: - return False - if self.isbookmark: - return False - if self.isfav: - return False - return True - - - @property - def iscategorised(self): - if self.ispage: - return False - return True - - - @property - def summary(self): - return self.meta.get('summary', '') - - - @property - def title(self): - if hasattr(self, '_title'): - return self._title - - self._title = '' - for maybe in ['title', 'bookmark-of', 'in-reply-to', 'repost-of']: - maybe = self.meta.get(maybe, False) - if maybe: - if isinstance(maybe, list): - maybe = maybe.pop() - self._title = maybe.replace('\n', ' ').replace('\r', '') - break - return self._title - - - @property - def url(self): - return "%s/%s/" % (shared.config.get('site', 'url'), self.fname) - - - @property - def tmplfile(self): - if self.ispage: - return 'page.html' - else: - return 'singular.html' - - - @property - def html(self): - if hasattr(self, '_html'): - return self._html - self._html = shared.Pandoc().convert(self.content) - return self._html - - - @property - def sumhtml(self): - if hasattr(self, '_sumhtml'): - return self._sumhtml - self._sumhtml = self.meta.get('summary', '') - if len(self._sumhtml): - self._sumhtml = shared.Pandoc().convert(self.summary) - return self._sumhtml - - - #@property - #def offlinecopies(self): - ## stupidly simple property caching - #if hasattr(self, 'copies'): - #return self.copies - - #copies = {} - #for maybe in ['bookmark-of', 'in-reply-to', 'repost-of', 'favorite-of']: - #maybe = self.meta.get(maybe, False) - #if not maybe: - #continue - #if not isinstance(maybe, list): - #maybe = [maybe] - #for url in maybe: - #arch = OfflineArchive(url) - #arch.run() - ##copies[url] = arch.read() - - ##self.copies = copies - ##return copies - - - @property - def exif(self): - if not self.isphoto: - return {} - return self.photo.exif - - - #@property - #def rssenclosure(self): - #if not self.isphoto: - #return {} - #return self.photo.rssenclosure - @property def tmplvars(self): - if hasattr(self, '_tmplvars'): - return self._tmplvars - - self._tmplvars = { + return { + 'src': self.src, + 'target': self.href, + 'css': self.cssclass, 'title': self.title, - 'published': self.published.datetime, - 'tags': self.tags, - 'author': self.author, - 'content': self.content, - 'html': self.html, - 'category': self.category, - 'reactions': self.reactions, - 'updated': self.updated.datetime, - 'summary': self.summary, - 'sumhtml': self.sumhtml, + 'alt': self.alt, 'exif': self.exif, - 'lang': self.lang, - 'syndicate': self.syndicate, - 'slug': self.fname, - 'shortslug': self.shortslug, - 'comments': self.comments, - 'replies': self.replies, - 'reacjis': self.reacjis, - 'photo': {}, - 'rssenclosure': {}, - 'license': self.license, + 'is_photo': self.is_photo, + 'author': self.meta.get('Artist', ''), } - if self.isphoto: - self._tmplvars.update({ - 'photo': self.photo.tmplvars, - 'rssenclosure': self.photo.rssenclosure - }) - return self._tmplvars + def __repr__(self): + return "Image: %s, photo: %r, EXIF: %s" % ( + self.fname, self.is_photo, self.exif + ) + + def __str__(self): + tmplfile = "%s.html" % (__class__.__name__) + return 
shared.j2.get_template(tmplfile).render({'photo': self.tmplvars}) - @property - def shortslug(self): - if hasattr(self, '_shortslug'): - return self._shortslug - self._shortslug = shared.baseN(self.pubtime) - return self._shortslug +def setup(): + """ parse input parameters and add them as params section to config """ + parser = argparse.ArgumentParser(description='Parameters for NASG') + booleanparams = { + 'regenerate': 'force downsizing images', + 'force': 'force rendering HTML', + } - @property - def target(self): - targetdir = os.path.abspath(os.path.join( - shared.config.get('target', 'builddir'), - self.fname - )) - return os.path.join(targetdir, 'index.html') - - - async def rendercomments(self, renderer): - for comment in self.comments: - await comment.render(renderer) - - - async def render(self, renderer = None): - renderer = renderer or Renderer() - # this is only when I want salmentions and I want to include all of the comments as well - # otherwise it affects both webmentions sending and search indexing - #if len(self.comments): - #lctime = self.comments[0].mtime - #if lctime > self.mtime: - #self.mtime = lctime - #await self.rendercomments(renderer) - - mtime = self.mtime - if len(self.comments): - lctime = self.comments[0].mtime - if lctime > self.mtime: - mtime = lctime - - logging.info("rendering and saving %s", self.fname) - if not shared.config.getboolean('params', 'force') and os.path.isfile(self.target): - ttime = int(os.path.getmtime(self.target)) - logging.debug('ttime is %d mtime is %d', ttime, mtime) - if ttime == mtime: - logging.debug( - '%s exists and up-to-date (lastmod: %d)', - self.target, - ttime - ) - return - - tmplvars = { - 'post': self.tmplvars, - 'site': renderer.sitevars, - 'taxonomy': {}, - } - r = renderer.j2.get_template(self.tmplfile).render(tmplvars) - self.writerendered(r, mtime) - - - async def ping(self, pinger): - if self.isfuture: - return - - logging.debug('urls in %s: %s', self.fname, self.urls) - for target in self.urls: - record = { - 'mtime': self.mtime, - 'source': self.url, - 'target': target - } - h = json.dumps(record, sort_keys=True) - h = hashlib.sha1(h.encode('utf-8')).hexdigest() - if pinger.db.get(h, False): - logging.debug( - "%s is already pinged from %s @ %d, skipping", - target, self.url, self.mtime - ) - continue - - logging.info("sending webmention from %s to %s", self.url, target) - ws = WebmentionSend(self.url, target) - try: - ws.send(allow_redirects=True, timeout=30) - except Exception as e: - logging.error('ping failed to %s', target) - - pinger.db[h] = record - - -class Webmentioner(object): - def __init__(self): - self.dbpath = os.path.abspath(os.path.join( - shared.config.get('target', 'builddir'), - shared.config.get('var', 'webmentions') - )) - - if os.path.isfile(self.dbpath): - with open(self.dbpath, 'rt') as f: - self.db = json.loads(f.read()) - else: - self.db = {} - - - def finish(self): - with open(self.dbpath, 'wt') as f: - f.write(json.dumps(self.db, sort_keys=True, indent=4)) - - -class NASG(object): - lockfile = os.path.join(tempfile.gettempdir(), 'nasg_%s.lock' % getpass.getuser()) - - def __init__(self): - # --- set params - parser = argparse.ArgumentParser(description='Parameters for NASG') + for k, v in booleanparams.items(): parser.add_argument( - '--clear', + '--%s' % (k), action='store_true', default=False, - help='clear build directory in advance' - ) - parser.add_argument( - '--regenerate', - action='store_true', - default=False, - help='force downsizing images' - ) - parser.add_argument( - 
'--force', - action='store_true', - default=False, - help='force rendering HTML' - ) - parser.add_argument( - '--loglevel', - default='error', - help='change loglevel' - ) - parser.add_argument( - '--nooffline', - action='store_true', - default=False, - help='skip offline copie checks' - ) - parser.add_argument( - '--nodownsize', - action='store_true', - default=False, - help='skip image downsizing' - ) - parser.add_argument( - '--norender', - action='store_true', - default=False, - help='skip rendering' - ) - parser.add_argument( - '--refetch', - action='store_true', - default=False, - help='force re-fetching offline archives' + help = v ) - params = vars(parser.parse_args()) + parser.add_argument( + '--loglevel', + default='warning', + help='change loglevel' + ) + + if not shared.config.has_section('params'): shared.config.add_section('params') - for k, v in params.items(): - shared.config.set('params', k, str(v)) + params = vars(parser.parse_args()) + for k, v in params.items(): + shared.config.set('params', k, str(v)) - # remove the rest of the potential loggers - while len(logging.root.handlers) > 0: - logging.root.removeHandler(logging.root.handlers[-1]) + # remove the rest of the potential loggers + while len(logging.root.handlers) > 0: + logging.root.removeHandler(logging.root.handlers[-1]) - # --- set loglevel - logging.basicConfig( - level=shared.LLEVEL[shared.config.get('params', 'loglevel')], - format='%(asctime)s - %(levelname)s - %(message)s' - ) + logging.basicConfig( + level=shared.LLEVEL[shared.config.get('params', 'loglevel')], + format='%(asctime)s - %(levelname)s - %(message)s' + ) - async def __adownsize(self): - for fname, img in self.images: - await img.downsize(self.existing) +def build(): + setup() + loop = asyncio.get_event_loop() + tasks = [] + content = Content() + sdb = db.SearchDB() + magic = MagicPHP() - async def __acrender(self): - for (pubtime, singular) in self.content: - await singular.render(self.renderer) + collector_front = Category() + collector_categories = NoDupeContainer() - async def __atrender(self): - for e in [self.content.categories, self.content.tags]: - for name, t in e.items(): - await t.render(self.renderer) + for f, post in content: + logging.info("PARSING %s", f) - async def __afrender(self): - await self.content.front.render(self.renderer) + # extend redirects + for r in post.redirects: + magic.redirects.append((r, post.fname)) - async def __aindex(self): - for (pubtime, singular) in self.content: - await self.searchdb.append(singular) - - async def __aping(self): - for (pubtime, singular) in self.content: - await singular.ping(self.pinger) - - def __atindexrender(self): - m = { - 'category': self.content.categories, - 'tag': self.content.tags - } - - for p, taxonomy in m.items(): - target = os.path.abspath(os.path.join( - shared.config.get('target', 'builddir'), - p, - 'index.html' - )) - - tmplvars = { - 'site': self.renderer.sitevars, - 'taxonomy': {}, - 'taxonomies': {} - } - - for name, t in taxonomy.items(): - logging.debug('adding %s to taxonomyindex' % name) - tmplvars['taxonomies'][name] = [{'title': t.data[k].title, 'url': t.data[k].url} for k in list(sorted( - t.data.keys(), reverse=True))] - - tmplvars['taxonomies'] = sorted(tmplvars['taxonomies'].items()) - logging.debug('rendering taxonomy index to %s', target) - r = self.renderer.j2.get_template('archiveindex.html').render(tmplvars) - with open(target, "wt") as html: - html.write(r) - - @property - def images(self): - if hasattr(self, '_images'): - return self._images - 
logging.info("discovering images") - images = Images() - images.populate() - self._images = images - return self._images - - @property - def content(self): - if hasattr(self, '_content'): - return self._content - logging.info("discovering content") - content = Content(self.images, self.comments) - content.populate() - self._content = content - return self._content - - @property - def comments(self): - if hasattr(self, '_comments'): - return self._comments - logging.info("discovering comments") - comments = Comments() - comments.populate() - self._comments = comments - return self._comments - - @property - def existing(self): - if hasattr(self, '_existing'): - return self._existing - existing = glob.glob(os.path.join( - shared.config.get('target', 'filesdir'), - "*" - )) - self._existing = existing - return self._existing - - @property - def renderer(self): - if hasattr(self, '_renderer'): - return self._renderer - self._renderer = Renderer() - return self._renderer - - @property - def searchdb(self): - if hasattr(self, '_searchdb'): - return self._searchdb - self._searchdb = Indexer() - return self._searchdb - - - @property - def pinger(self): - if hasattr(self, '_pinger'): - return self._pinger - self._pinger = Webmentioner() - return self._pinger - - - def run(self): - if os.path.isfile(self.lockfile): - raise ValueError( - "Lockfile is present at %s; another instance is running." % ( - self.lockfile - ) + # add post to search, if needed + if not sdb.is_uptodate(post.fname, post.mtime): + sdb.append( + post.fname, + post.corpus, + post.mtime, + post.url, + post.category, + post.title ) + + # add render task, if needed + if not post.is_uptodate or shared.config.get('params', 'force'): + task = loop.create_task(post.render()) + tasks.append(task) + + # collect images to downsize + for fname, im in post.images: + task = loop.create_task(im.downsize()) + tasks.append(task) + + # skip categories starting with _ + if post.category.startswith('_'): + continue + # get the category otherwise + elif post.category not in collector_categories : + c = Category(post.category) + collector_categories.append(post.category, c) else: - atexit.register(os.remove, self.lockfile) - with open(self.lockfile, "wt") as f: - f.write(arrow.utcnow().format()) + c = collector_categories[post.category] - if shared.config.getboolean('params', 'clear'): - input('about to clear build directory, press enter to continue') - shutil.rmtree(os.path.abspath( - shared.config.get('target', 'builddir') - )) + # add post to category + c.append(post) - loop = asyncio.get_event_loop() - - for d in shared.config.options('target'): - if 'dir' in d and not os.path.isdir(shared.config.get('target', d)): - os.mkdir(shared.config.get('target', d)) - - if not shared.config.getboolean('params', 'nodownsize'): - logging.info("downsizing images") - loop.run_until_complete(self.__adownsize()) - - if not shared.config.getboolean('params', 'norender'): - logging.info("rendering content") - loop.run_until_complete(self.__acrender()) - - logging.info("rendering categories and tags") - loop.run_until_complete(self.__atrender()) - - logging.info("rendering the front page elements") - loop.run_until_complete(self.__afrender()) - - logging.info("rendering taxonomy indexes") - self.__atindexrender() - - logging.info("rendering sitemap") - self.content.sitemap() + # add post to front + collector_front.append(post) - logging.info("render magic.php") - self.content.magicphp(self.renderer) + # write search db + sdb.finish() - logging.info("copy the static 
bits") - src = shared.config.get('source', 'staticdir') - for item in os.listdir(src): - s = os.path.join(src, item) - d = os.path.join(shared.config.get('target', 'builddir'), item) - logging.debug("copying %s to %s", s, d) + # render front + task = loop.create_task(collector_front.render()) + tasks.append(task) + + # render categories + for name, c in collector_categories: + task = loop.create_task(c.render()) + tasks.append(task) + + # add magic.php rendering + task = loop.create_task(magic.render()) + tasks.append(task) + + # TODO: send webmentions to any url + # TODO: comments + # TODO: ping websub? + + # do all the things! + w = asyncio.wait(tasks) + loop.run_until_complete(w) + loop.close() + + # copy static + logging.info('copying static files') + src = shared.config.get('dirs', 'static') + for item in os.listdir(src): + s = os.path.join(src,item) + d = os.path.join(shared.config.get('common', 'build'),item) + if not os.path.exists(d): + logging.debug("copying static file %s to %s", s, d) shutil.copy2(s, d) - logging.info("pouplating searchdb") - - loop.run_until_complete(self.__aindex()) - self.searchdb.finish() - - logging.info("webmentioning urls") - loop.run_until_complete(self.__aping()) - self.pinger.finish() - - loop.close() - if __name__ == '__main__': - worker = NASG() - worker.run() + build() diff --git a/requirements.txt b/requirements.txt index 8df2aa7..530fffe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,30 +1,8 @@ -aiofiles==0.3.1 -appdirs==1.4.3 arrow==0.10.0 -breadability==0.1.20 -chardet==3.0.3 -decorator==4.0.11 -docopt==0.6.2 -httptools==0.0.9 Jinja2==2.9.6 langdetect==1.0.7 -lxml==3.7.3 -MarkupSafe==1.0 -packaging==16.8 -pyparsing==2.2.0 -python-dateutil==2.6.0 -python-frontmatter==0.4.2 -python-magic==0.4.13 -PyYAML==3.12 -requests==2.14.2 -sanic==0.5.4 -similar-text==0.2.0 -six==1.10.0 -ujson==1.35 +requests==2.12.4 +requests-oauthlib==0.8.0 +sanic==0.6.0 unicode-slugify==0.1.3 -Unidecode==0.4.20 -uvloop==0.8.0 -validators==0.11.3 Wand==0.4.4 -websockets==3.3 -Whoosh==2.7.4 diff --git a/router.py b/router.py new file mode 100644 index 0000000..f86ef6a --- /dev/null +++ b/router.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 + +#import asyncio +#import uvloop +from sanic import Sanic +import sanic.response +import logging +import db +import shared +import validators +import urllib.parse + +if __name__ == '__main__': + logging_format = "[%(asctime)s] %(process)d-%(levelname)s " + logging_format += "%(module)s::%(funcName)s():l%(lineno)d: " + logging_format += "%(message)s" + + logging.basicConfig( + format=logging_format, + level=logging.DEBUG + ) + log = logging.getLogger() + + # log_config=None prevents creation of access_log and error_log files + # since I'm running this from systemctl it already goes into syslog + app = Sanic('router', log_config=None) + # this is ok to be read-only + sdb = db.SearchDB() + + + @app.route("/oauth1", methods=["GET"]) + async def oauth1(request): + token = request.args.get('oauth_token') + verifier = request.args.get('oauth_verifier') + tokendb = shared.TokenDB() + tokendb.update_token( + token, + verifier=verifier + ) + return sanic.response.text("OK",status=200) + + + @app.route("/search", methods=["GET"]) + async def search(request): + query = request.args.get('s') + r = sdb.html(query) + response = sanic.response.html(r, status=200) + return response + + + @app.route("/micropub", methods=["POST","GET"]) + async def micropub(request): + return sanic.response.text("Not Implemented", status=501) + + + 
@app.route("/webmention", methods=["POST"]) + async def webmention(request): + source = request.form.get('source') + target = request.form.get('target') + + # validate urls + if not validators.url(source): + return sanic.response.text('Invalide source url', status=400) + if not validators.url(target): + return sanic.response.text('Invalide target url', status=400) + + # check if our site is actually the target for the webmention + _target = urllib.parse.urlparse(target) + if _target.hostname not in shared.config.get('site', 'domains'): + return sanic.response.text('target domain is not me', status=400) + + # ignore selfpings + _source = urllib.parse.urlparse(source) + if _source.hostname in shared.config.get('site', 'domains'): + return sanic.response.text('selfpings are not allowed', status=400) + + # it is unfortunate that I need to init this every time, but + # otherwise it'll become read-only for reasons I'm yet to grasp + # the actual parsing will be done at site generation time + wdb = db.WebmentionQueue() + wdb.queue(source,target) + response = sanic.response.text("Accepted", status=202) + return response + + + app.run(host="127.0.0.1",port=8008, log_config=None) diff --git a/shared.py b/shared.py index 7e56a4e..7f7d08a 100644 --- a/shared.py +++ b/shared.py @@ -5,131 +5,10 @@ import glob import logging import subprocess import json -import requests -from urllib.parse import urlparse, urlunparse +import sqlite3 -from whoosh import fields -from whoosh import analysis from slugify import slugify - -LLEVEL = { - 'critical': 50, - 'error': 40, - 'warning': 30, - 'info': 20, - 'debug': 10 -} - - -def __expandconfig(config): - """ add the dirs to the config automatically """ - basepath = os.path.expanduser(config.get('common','base')) - config.set('common', 'basedir', basepath) - for section in ['source', 'target']: - for option in config.options(section): - opt = config.get(section, option) - config.set(section, "%sdir" % option, os.path.join(basepath,opt)) - config.set('target', 'filesdir', os.path.join( - config.get('target', 'builddir'), - config.get('source', 'files'), - )) - config.set('target', 'commentsdir', os.path.join( - config.get('target', 'builddir'), - config.get('site', 'commentspath'), - )) - return config - - -def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"): - """ Used to create short, lowercase slug for a number (an epoch) passed """ - num = int(num) - return ((num == 0) and numerals[0]) or ( - baseN( - num // b, - b, - numerals - ).lstrip(numerals[0]) + numerals[num % b] - ) - -def slugfname(url): - return "%s" % slugify( - re.sub(r"^https?://(?:www)?", "", url), - only_ascii=True, - lower=True - )[:200] - -ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ' -STRFISO = '%Y-%m-%dT%H:%M:%S%z' - -URLREGEX = re.compile( - r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+' - r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*' -) - -EXIFREXEG = re.compile( - r'^(?P[0-9]{4}):(?P[0-9]{2}):(?P[0-9]{2})\s+' - r'(?P