commit f5c599cef923afe23974252f778dd4e4cb214b80 Author: Peter Molnar Date: Tue May 23 11:10:30 2017 +0100 1.0 version diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bf84ffe --- /dev/null +++ b/.gitignore @@ -0,0 +1,103 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +config.ini +config.yml diff --git a/cache.py b/cache.py new file mode 100644 index 0000000..22d05bc --- /dev/null +++ b/cache.py @@ -0,0 +1,56 @@ +import os +import json +import hashlib +import logging +import glob + +class Cached(object): + def __init__(self, hash='', text='', stime=0): + + if not os.path.isdir(glob.CACHE): + os.mkdir(glob.CACHE) + + if hash: + self._hbase = hash + elif text: + self._hbase = hashlib.sha1(text.encode('utf-8')).hexdigest() + else: + print("No identifier passed for Cached") + raise + + self._cpath = os.path.join(glob.CACHE, self._hbase) + self._stime = stime + + if os.path.isfile(self._cpath): + self._ctime = os.stat(self._cpath) + else: + self._ctime = None + + def get(self): + if not glob.CACHEENABLED: + return None + + cached = '' + if os.path.isfile(self._cpath): + if self._stime and self._stime.st_mtime == self._ctime.st_mtime: + logging.debug("Cache exists at %s; using it" % (self._cpath )) + with open(self._cpath, 'r') as c: + cached = c.read() + c.close() + # invalidate old + elif self._stime and self._stime.st_mtime > self._ctime.st_mtime: + logging.debug("invalidating cache at %s" % (self._cpath )) + os.remove(self._cpath) + + return cached + + def set(self, content): + if not glob.CACHEENABLED: + return None + + with open(self._cpath, "w") as c: + logging.debug("writing cache to %s" % (self._cpath )) + c.write(content) + c.close() + if self._stime: + os.utime(self._cpath, (self._stime.st_mtime, self._stime.st_mtime )) \ No newline at end of file diff --git a/generator.py b/generator.py new file mode 100644 index 0000000..6a365d4 --- /dev/null +++ b/generator.py @@ -0,0 +1,293 @@ +#!/home/petermolnar.net/.venv/bin/python3.5 + +"""Usage: generator.py [-h] [-f] [-g] [-p] [-d] [-s FILE] + +-h --help show this +-f --force force HTML file rendering +-p --pandoc force re-rendering content HTML +-g --regenerate regenerate images +-s --single FILE only (re)generate a single entity +-d --debug set logging level +""" + +import os +import shutil +import logging +import atexit +import 
json +import sys +import tempfile +import glob +from whoosh import index +from docopt import docopt +from ruamel import yaml +from webmentiontools.send import WebmentionSend +import taxonomy +import singular +from slugify import slugify +import arrow + + +class Engine(object): + lockfile = "/tmp/petermolnar.net.generator.lock" + + def __init__(self): + if os.path.isfile(self.lockfile): + raise ValueError("Lockfile %s is present; generator won't run.") + else: + with open(self.lockfile, "w") as lock: + lock.write(arrow.utcnow().format()) + lock.close() + + atexit.register(self.removelock) + atexit.register(self.removetmp) + + self._mkdirs() + self.tags = {} + self.category = {} + self.allposts = None + self.frontposts = None + + self.slugsdb = os.path.join(glob.CACHE, "slugs.json") + if os.path.isfile(self.slugsdb): + with open(self.slugsdb) as slugsdb: + self.allslugs = json.loads(slugsdb.read()) + slugsdb.close() + else: + self.allslugs = [] + + self.tmpwhoosh = tempfile.mkdtemp('whooshdb_', dir=tempfile.gettempdir()) + self.whoosh = index.create_in(self.tmpwhoosh, glob.schema) + + + def removelock(self): + os.unlink(self.lockfile) + + + def removetmp(self): + if os.path.isdir(self.tmpwhoosh): + for root, dirs, files in os.walk(self.tmpwhoosh, topdown=False): + for f in files: + os.remove(os.path.join(root, f)) + for d in dirs: + os.rmdir(os.path.join(root, d)) + + + def initbuilder(self): + self._copy_and_compile() + + + def cleanup(self): + with open(os.path.join(glob.CACHE, "slugs.json"), "w") as db: + logging.info("updating slugs database") + db.write(json.dumps(self.allslugs)) + db.close() + + tags = [] + for tslug, taxonomy in self.tags.items(): + tags.append(taxonomy.name) + + with open(os.path.join(glob.CACHE, "tags.json"), "w") as db: + logging.info("updating tags database") + db.write(json.dumps(tags)) + db.close() + + logging.info("deleting old searchdb") + shutil.rmtree(glob.SEARCHDB) + logging.info("moving new searchdb") + shutil.move(self.tmpwhoosh, glob.SEARCHDB) + + + def _mkdirs(self): + for d in [glob.TARGET, glob.TFILES, glob.TTHEME, glob.CACHE]: + if not os.path.isdir(d): + os.mkdir(d) + + + def _copy_and_compile(self): + for f in os.listdir(glob.STHEME): + p = os.path.join(glob.STHEME, f) + if os.path.isdir(p): + try: + shutil.copytree(p, os.path.join(glob.TTHEME, f)) + except FileExistsError: + pass + else: + path, fname = os.path.split(p) + fname, ext = os.path.splitext(fname) + logging.debug("copying %s", p) + shutil.copy(p, os.path.join(glob.TTHEME, f)) + + @staticmethod + def postbycategory(fpath, catd=None, catn=None): + if catd == 'photo': + post = singular.PhotoHandler(fpath, category=catn) + elif catd == 'page': + post = singular.PageHandler(fpath) + else: + post = singular.ArticleHandler(fpath, category=catn) + + return post + + def collect(self): + self.allposts = taxonomy.TaxonomyHandler() + #self.gallery = taxonomy.TaxonomyHandler(taxonomy="photography", name="Photography") + self.frontposts = taxonomy.TaxonomyHandler() + + for category in glob.conf['category'].items(): + catn, catd = category + catp = os.path.abspath(os.path.join(glob.CONTENT, catn)) + + if not os.path.exists(catp): + continue + + logging.debug("getting posts for category %s from %s", catn, catp) + + cat = taxonomy.TaxonomyHandler(taxonomy='category', name=catn) + self.category[catn] = cat + + for f in os.listdir(catp): + fpath = os.path.join(catp, f) + + if not os.path.isfile(fpath): + continue + + logging.debug("parsing %s", fpath) + exclude = False + if 'exclude' in catd: + exclude = 
bool(catd['exclude']) + + ct = None + if 'type' in catd: + ct = catd['type'] + + post = Engine.postbycategory(fpath, catd=ct, catn=catn) + + self.allposts.append(post) + if post.dtime > arrow.utcnow().timestamp: + logging.warning( + "Post '%s' will be posted in the future; " + "skipping it from Taxonomies for now", fpath + ) + else: + cat.append(post) + if not exclude: + self.frontposts.append(post) + if hasattr(post, 'tags') and isinstance(post.tags, list): + for tag in post.tags: + tslug = slugify(tag, only_ascii=True, lower=True) + if not tslug in self.tags.keys(): + t = taxonomy.TaxonomyHandler(taxonomy='tag', name=tag) + self.tags[tslug] = t + else: + t = self.tags[tslug] + t.append(post) + elif not hasattr(post, 'tags'): + logging.error("%s post does not have tags", post.fname) + elif not isinstance(post.tags, list): + logging.error( + "%s tags are not a list, it's %s ", + post.fname, + type(post.tags) + ) + + + for r in post.redirect.keys(): + self.allslugs.append(r) + self.allslugs.append(post.fname) + + + def renderposts(self): + for p in self.allposts.posts.items(): + time, post = p + post.write() + post.redirects() + post.pings() + post.index(self.whoosh) + + + def rendertaxonomies(self): + for t in [self.tags, self.category]: + for tname, tax in t.items(): + if glob.conf['category'].get(tname, False): + if glob.conf['category'][tname].get('nocollection', False): + + logging.info("skipping taxonomy '%s' due to config nocollections", tname) + continue + + tax.write_paginated() + tax.index(self.whoosh) + self.frontposts.write_paginated() + #self.gallery.write_simple(template='gallery.html') + self.allposts.writesitemap() + + def globredirects(self): + redirects = os.path.join(glob.CONTENT,'redirects.yml') + + if not os.path.isfile(redirects): + return + + ftime = os.stat(redirects) + rdb = {} + with open(redirects, 'r') as db: + rdb = yaml.safe_load(db) + db.close() + + for r_ in rdb.items(): + target, slugs = r_ + for slug in slugs: + singular.SingularHandler.write_redirect( + slug, + "%s/%s" % (glob.conf['site']['url'], target), + ftime.st_mtime + ) + + def recordlastrun(self): + if os.path.exists(glob.lastrun): + t = arrow.utcnow().timestamp + os.utime(glob.lastrun, (t,t)) + else: + open(glob.lastrun, 'a').close() + + +if __name__ == '__main__': + + args = docopt(__doc__, version='generator.py 0.2') + + if args['--pandoc']: + glob.CACHEENABLED = False + + if args['--force']: + glob.FORCEWRITE = True + + if args['--regenerate']: + glob.REGENERATE = True + + logform = '%(asctime)s - %(levelname)s - %(message)s' + if args['--debug']: + loglevel = 10 + else: + loglevel = 40 + + + while len(logging.root.handlers) > 0: + logging.root.removeHandler(logging.root.handlers[-1]) + logging.basicConfig(level=loglevel, format=logform) + + if args['--single']: + logging.info("(re)generating a single item only") + path = args['--single'].split('/') + fpath = os.path.join(glob.CONTENT, path[0], path[1]) + post = Engine.postbycategory(fpath, catd=path[0]) + post.pings() + post.write() + sys.exit(0) + else: + eng = Engine() + eng.initbuilder() + eng.collect() + eng.renderposts() + eng.globredirects() + eng.rendertaxonomies() + eng.recordlastrun() + eng.cleanup() \ No newline at end of file diff --git a/glob.py b/glob.py new file mode 100644 index 0000000..1ab69fa --- /dev/null +++ b/glob.py @@ -0,0 +1,109 @@ +import os +import logging +from ruamel import yaml +from whoosh import fields +from whoosh import analysis +import jinja2 +from slugify import slugify +import arrow + +schema = 
fields.Schema( + url=fields.ID( + stored=True, + ), + title=fields.TEXT( + stored=True, + analyzer=analysis.FancyAnalyzer( + ) + ), + date=fields.DATETIME( + stored=True, + sortable=True + ), + content=fields.TEXT( + stored=True, + analyzer=analysis.FancyAnalyzer( + ) + ), + tags=fields.TEXT( + stored=True, + analyzer=analysis.KeywordAnalyzer( + lowercase=True, + commas=True + ) + ), + weight=fields.NUMERIC( + sortable=True + ), + img=fields.TEXT( + stored=True + ) +) + +BASEDIR = os.path.dirname(os.path.abspath(__file__)) +CONFIG = os.path.abspath(os.path.join(BASEDIR, 'config.yml')) + +with open(CONFIG, 'r') as c: + conf = yaml.safe_load(c) + conf['site']['author'] = conf['author'] + c.close() + +secrets = os.path.abspath(os.path.join(BASEDIR, 'secret.yml')) +if os.path.isfile(secrets): + with open(secrets, 'r') as c: + conf['secrets'] = yaml.safe_load(c) + c.close() + +CACHEENABLED = True +REGENERATE = False +FORCEWRITE = False + +ISODATE = '%Y-%m-%dT%H:%M:%S%z' + +SOURCE = os.path.abspath(conf['dirs']['source']['root']) +CONTENT = os.path.abspath(conf['dirs']['source']['content']) +FONT = os.path.abspath(conf['dirs']['font']) +STHEME = os.path.abspath(conf['dirs']['source']['theme']) +SFILES = os.path.abspath(conf['dirs']['source']['files']) +TEMPLATES = os.path.abspath(conf['dirs']['source']['templates']) +COMMENTS = os.path.abspath(conf['dirs']['source']['comments']) + +TARGET = os.path.abspath(conf['dirs']['target']['root']) +TTHEME = os.path.abspath(conf['dirs']['target']['theme']) +TFILES = os.path.abspath(conf['dirs']['target']['files']) +UFILES = conf['dirs']['target']['furl'] + +CACHE = os.path.abspath(conf['dirs']['cache']) +SEARCHDB = os.path.abspath(conf['dirs']['searchdb']) + +WEBMENTIONDB = os.path.abspath(conf['webmentiondb']) +LOGDIR = os.path.abspath(conf['dirs']['log']) +GPSDIR = os.path.abspath(conf['dirs']['gps']) +TSDBDIR = os.path.abspath(conf['dirs']['tsdb']) +LOCALCOPIES = os.path.abspath(conf['dirs']['localcopies']) + +lastrun = '/tmp/generator_last_run' + +os.environ.setdefault('PYPANDOC_PANDOC', '/usr/bin/pandoc') + +def jinja_filter_date(d, form='%Y-%m-%d %H:%m:%S'): + if d == 'now': + return arrow.now().strftime(form) + if form == 'c': + form = '%Y-%m-%dT%H:%M:%S%z' + return d.strftime(form) + +def jinja_filter_slugify(s): + return slugify(s, only_ascii=True, lower=True) + +def jinja_filter_search(s, r): + if r in s: + return True + return False + +jinjaldr = jinja2.FileSystemLoader(searchpath=TEMPLATES) +jinja2env = jinja2.Environment(loader=jinjaldr) + +jinja2env.filters['date'] = jinja_filter_date +jinja2env.filters['search'] = jinja_filter_search +jinja2env.filters['slugify'] = jinja_filter_slugify \ No newline at end of file diff --git a/img.py b/img.py new file mode 100644 index 0000000..3156f78 --- /dev/null +++ b/img.py @@ -0,0 +1,370 @@ +import os +import re +import sys +import json +import shutil +import collections +import logging +import imghdr +from ctypes import c_void_p, c_size_t +import glob +import pyexifinfo +from similar_text import similar_text +from cache import Cached +import wand.api +import wand.image +import wand.drawing +import wand.color +from PIL import Image +#from subprocess import call + +# https://stackoverflow.com/questions/34617422/how-to-optimize-image-size-using-wand-in-python +wand.api.library.MagickSetCompressionQuality.argtypes = [c_void_p, c_size_t] + + +class ImageHandler(object): + def __init__(self, fpath, alttext='', title='', imgcl='', linkto=False): + + self.fpath = os.path.abspath(fpath) + path, fname = 
os.path.split(self.fpath) + fname, ext = os.path.splitext(fname) + self.fname = fname + self.fext = ext + self.ftime = os.stat(self.fpath) + self.linkto = linkto + + self.alttext = alttext + self.title = title + self.imgcl = imgcl + + self.c = os.path.join(glob.TFILES, self.fname) + self.u = "%s/%s/%s" % (glob.conf['site']['url'],glob.UFILES, self.fname) + + self.what = imghdr.what(self.fpath) + + self.meta = {} + + self.exif = {} + if self.what == 'jpeg': + self._setexif() + + self.watermark = '' + wfile = os.path.join(glob.SOURCE, glob.conf['watermark']) + if os.path.isfile(wfile): + self.watermark = wfile + + sizes = { + 90: { + 'ext': 's', + 'cropped': True, + }, + 360: { + 'ext': 'm', + }, + #540: 'n', + 720: { + 'ext': 'z', + }, + #980: 'c', + 1280: { + 'ext': 'b', + } + } + self.sizes = collections.OrderedDict(sorted(sizes.items(), reverse=0)) + + for size, meta in self.sizes.items(): + meta['path'] = "%s_%s%s" % (self.c, meta['ext'], self.fext) + meta['url'] = "%s_%s%s" % (self.u, meta['ext'], self.fext) + meta['mime'] = "image/%s" % (self.what) + + + self._setmeta() + self.fallbacksize = 720 + self.srcsetmin = 720 + + self._is_photo() + + if self.is_photo: + self.srcset = self.mksrcset(generate_caption=False, uphoto=False) + + + def _setmeta(self): + s = collections.OrderedDict(reversed(list(self.sizes.items()))) + for size, meta in s.items(): + if os.path.isfile(meta['path']): + with Image.open(meta['path']) as im: + meta['width'], meta['height'] = im.size + meta['size'] = os.path.getsize(meta['path']) + self.meta = meta + break + + + def downsize(self, liquidcrop=True, watermark=True): + if not self._is_downsizeable(): + return self._copy() + + if not self._isneeded(): + logging.debug("downsizing not needed for %s", self.fpath) + return + + logging.debug("downsizing %s", self.fpath) + try: + img = wand.image.Image(filename=self.fpath) + img.auto_orient() + except: + print("Unexpected error:", sys.exc_info()[0]) + raise + + # watermark + if self.is_photo and self.watermark and img.format == "JPEG" and watermark: + img = self._watermark(img) + + elif self.linkto: + img = self._sourceurlmark(img) + + # resize & cache + for size, meta in self.sizes.items(): + self._intermediate(img, size, meta) + + self._setmeta() + + + def _setexif(self): + cached = Cached(text=self.fname, stime=self.ftime) + cexif = cached.get() + + if cexif: + self.exif = json.loads(cexif) + else: + exif = pyexifinfo.get_json(self.fpath) + self.exif = exif.pop() + cached.set(json.dumps(self.exif)) + + + def _is_photo(self): + self.is_photo = False + if 'cameras' in glob.conf: + if 'EXIF:Model' in self.exif: + if self.exif['EXIF:Model'] in glob.conf['cameras']: + self.is_photo = True + + if 'copyright' in glob.conf: + if 'IPTC:CopyrightNotice' in self.exif: + for s in glob.conf['copyright']: + pattern = re.compile(r'%s' % s) + if pattern.search(self.exif['IPTC:CopyrightNotice']): + self.is_photo = True + + if self.is_photo: + #self.category = "photo" + + if not self.alttext: + keywords = ['XMP:Description', 'IPTC:Caption-Abstract'] + for key in keywords: + if key in self.exif and self.exif[key]: + self.alttext = self.exif[key] + break + + if not self.title: + keywords = ['XMP:Title', 'XMP:Headline', 'IPTC:Headline'] + for key in keywords: + if key in self.exif and self.exif[key]: + self.title = self.exif[key] + break + + + def _is_downsizeable(self): + if self.what != 'jpeg' and self.what != 'png': + return False + if self.imgcl: + return False + return True + + + def _watermark(self, img): + wmark = 
wand.image.Image(filename=self.watermark) + + if img.width > img.height: + w = img.width * 0.16 + h = wmark.height * (w / wmark.width) + x = img.width - w - (img.width * 0.01) + y = img.height - h - (img.height * 0.01) + else: + w = img.height * 0.16 + h = wmark.height * (w / wmark.width) + x = img.width - h - (img.width * 0.01) + y = img.height - w - (img.height * 0.01) + + w = round(w) + h = round(h) + x = round(x) + y = round(y) + + wmark.resize(w, h) + if img.width < img.height: + wmark.rotate(-90) + img.composite(image=wmark, left=x, top=y) + return img + + + def _sourceurlmark(self, img): + with wand.drawing.Drawing() as draw: + draw.fill_color = wand.color.Color('#fff') + draw.fill_opacity = 0.8 + draw.stroke_color = wand.color.Color('#fff') + draw.stroke_opacity = 0.8 + r_h = round(img.height * 0.3) + r_top = round((img.height/2) - (r_h/2)) + + draw.rectangle( + left=0, + top=r_top, + width=img.width, + height=r_h + ) + + draw(img) + + with wand.drawing.Drawing() as draw: + draw.font = os.path.join(glob.FONT) + draw.font_size = round((img.width)/len(self.linkto)*1.5) + draw.gravity = 'center' + draw.text( + 0, + 0, + self.linkto + ) + draw(img) + return img + + + def _copy(self): + p = self.c + self.fext + if not os.path.isfile(p): + logging.debug("copying %s" % self.fpath) + shutil.copy(self.fpath, p) + return + + + def _isneeded(self): + # skip existing + needed = False + if glob.REGENERATE: + needed = True + else: + for size, meta in self.sizes.items(): + if not os.path.isfile(meta['path']): + needed = True + + return needed + + + def _intermediate_dimensions(self, img, size, meta): + if (img.width > img.height and 'crop' not in meta) \ + or (img.width < img.height and 'crop' in meta): + width = size + height = int(float(size / img.width) * img.height) + else: + height = size + width = int(float(size / img.height) * img.width) + + return (width, height) + + + def _intermediate_symlink(self, meta): + # create a symlink to the largest resize with the full filename; + # this is to ensure backwards compatibility and avoid 404s + altsrc = meta['path'] + altdst = self.c + self.fext + + if not os.path.islink(altdst): + if os.path.isfile(altdst): + os.unlink(altdst) + os.symlink(altsrc, altdst) + + + def _intermediate(self, img, size, meta): + # skip existing unless regenerate needed + if os.path.isfile(meta['path']) and not glob.REGENERATE: + return + + # too small images: move on + #if size > img.height and size > img.width: + # return + width, height = self._intermediate_dimensions(img, size, meta) + + try: + thumb = img.clone() + thumb.resize(width, height) + #thumb.resize(width, height, filter='robidouxsharp') + + if 'crop' in meta and liquidcrop: + thumb.liquid_rescale(size, size, 1, 1) + elif 'crop' in meta: + l = t = 0 + if width > size: + l = int((width - size) / 2) + if height > size: + t = int((height - size) / 2) + thumb.crop(left=l, top=t, width=size, height=size) + + if img.format == "PNG": + library.MagickSetCompressionQuality(img.wand, 75) + + if img.format == "JPEG": + thumb.compression_quality = 86 + thumb.unsharp_mask(radius=0, sigma=0.5, amount=1, threshold=0.03) + thumb.format = 'pjpeg' + + # this is to make sure pjpeg happens + with open(meta['path'], 'wb') as f: + thumb.save(file=f) + + if size == list(self.sizes.keys())[-1]: + self._intermediate_symlink(meta) + + #if img.format == "JPEG": + ## this one strips the embedded little jpg + #call(['/usr/bin/jhead', '-dt', '-q', cpath]) + + except: + print("Unexpected error:", sys.exc_info()[0]) + raise + + + def 
mksrcset(self, generate_caption=True, uphoto=False): + if not self._is_downsizeable(): + return False + + for size, meta in self.sizes.items(): + if 'crop' in meta: + continue + + # increase fallback until max fallback reached + if size <= self.fallbacksize: + fallback = meta['url'] + + # set target for the largest + target = meta['url'] + + if uphoto: + uphotoclass=' u-photo' + else: + uphotoclass='' + caption = '' + + if not self.imgcl: + cl = '' + else: + cl = self.imgcl + + if self.alttext \ + and similar_text(self.alttext, self.fname) < 90 \ + and similar_text(self.alttext, self.fname + '.' + self.fext) < 90 \ + and generate_caption: + caption = '
<figcaption>%s</figcaption>
' % (self.alttext) + + if self.linkto: + target = self.linkto + + return '
<figure><a target="_blank" class="adaptive%s" href="%s"><img src="%s" class="%s" alt="%s" /></a>%s</figure>
' % (uphotoclass, target, fallback, self.imgcl, self.alttext, caption) \ No newline at end of file diff --git a/new.py b/new.py new file mode 100755 index 0000000..d1f0503 --- /dev/null +++ b/new.py @@ -0,0 +1,203 @@ +#!/home/petermolnar.net/.venv/bin/python3.5 + +"""Usage: new.py [-h] [-t TAGS] [-d DATE] [-s SLUG] [-l TITLE] [-b BOOKMARK] [-r REPLY] [-p REPOST] [-c CONTENT] [-u SUMMARY] [-i REDIRECT] [-a CATEGORY] + +-h --help show this +-t --tags TAGS ';' separated, quoted list of tags +-d --date DATE YYYY-mm-ddTHH:MM:SS+TZTZ formatted date, if not now +-s --slug SLUG slug (normally autogenerated from title or pubdate) +-l --title TITLE title of new entry +-b --bookmark BOOKMARK URL to bookmark +-r --reply REPLY URL to reply to +-p --repost REPOST URL to repost +-c --content CONTENT content of entry +-u --summary SUMMARY summary of entry +-i --redirect REDIRECT ';' separated, quoted list of redirects +-a --category CATEGORY to put the content in this category +""" + +import os +import sys +import datetime +import calendar +import logging +import json +import glob +import iso8601 +import pytz +from docopt import docopt +from slugify import slugify +from ruamel import yaml +import singular + +class ContentCreator(object): + def __init__( + self, + category='note', + tags=[], + date='', + slug='', + title='', + bookmark='', + reply='', + repost='', + content='', + summary='', + redirect=[] + ): + self.category = category + + if date: + self.date = iso8601.parse_date(date) + else: + self.date = datetime.datetime.utcnow().replace(tzinfo=pytz.utc) + self.time = calendar.timegm(self.date.timetuple()) + + self.title = title + + if slug: + self.slug = slug + elif title: + self.slug = slugify(title, only_ascii=True, lower=True) + else: + self.slug = singular.SingularHandler.baseN(self.time) + + self.tags = tags + self.bookmark = bookmark + self.reply = reply + self.repost = repost + if content: + self.content = content + else: + self.content = '' + self.summary = summary + self.redirect = redirect + + self._makeyaml() + self._write() + + + def _makeyaml(self): + self.yaml = { + 'published': self.date.strftime("%Y-%m-%dT%H:%M:%S%z") + } + + if self.title: + self.yaml['title'] = self.title + + if self.tags: + self.yaml['tags'] = self.tags + + if self.bookmark: + self.yaml['bookmark-of'] = self.bookmark + + if self.repost: + self.yaml['repost-of'] = self.repost + + if self.reply: + self.yaml['in-reply-to'] = self.reply + + if self.summary: + self.yaml['summary'] = self.summary + + if self.redirect: + self.yaml['redirect'] = self.redirect + + def _write(self): + fdir = os.path.join(glob.CONTENT, self.category) + if not os.path.isdir(fdir): + sys.exit("there is no category %s" % (self.category)) + + self.fpath = os.path.join(glob.CONTENT, self.category, "%s.md" % (self.slug)) + self.out = "---\n" + yaml.dump(self.yaml, Dumper=yaml.RoundTripDumper) + "---\n\n" + self.content + with open(self.fpath, "w") as archive: + logging.info("writing %s", self.fpath) + logging.info("contents: %s", self.out) + archive.write(self.out) + archive.close() + + +class ParseCMDLine(object): + def __init__(self, arguments): + for x in ['--redirect', '--tags']: + if x in arguments and arguments[x]: + arguments[x] = arguments[x].split(";") + + self.entry = ContentCreator( + category=arguments['--category'], + tags=arguments['--tags'], + date=arguments['--date'], + slug=arguments['--slug'], + title=arguments['--title'], + bookmark=arguments['--bookmark'], + reply=arguments['--reply'], + repost=arguments['--repost'], + 
content=arguments['--content'], + summary=arguments['--summary'], + redirect=arguments['--redirect'] + ) + +if __name__ == '__main__': + args = docopt(__doc__, version='new.py 0.1') + + with open(os.path.join(glob.CACHE, "slugs.json")) as sf: + slugs = json.loads(sf.read()) + sf.close() + + if not args['--category']: + c = 'note' + args['--category'] = input('Category [%s]: ' % (c)) or c + + if not args['--date']: + d = datetime.datetime.utcnow().replace(tzinfo=pytz.utc).strftime("%Y-%m-%dT%H:%M:%S%z") + args['--date'] = input('Date [%s]' % (d)) or d + + if not args['--title']: + args['--title'] = input('Title []:') or '' + + if not args['--tags']: + args['--tags'] = input('Tags (separated by ;, no whitespace) []:') or [] + + if not args['--bookmark']: + args['--bookmark'] = input('Bookmark of URL []:') or '' + + if not args['--reply']: + args['--reply'] = input('Reply to URL []:') or '' + + if not args['--repost']: + args['--repost'] = input('Repost of URL []:') or '' + + if not args['--slug']: + if args['--title']: + slug = slugify(args['--title'], only_ascii=True, lower=True) + elif args['--bookmark']: + slug = slugify("re: %s" % (args['--bookmark']), only_ascii=True, lower=True) + elif args['--reply']: + slug = slugify("re: %s" % (args['--reply']), only_ascii=True, lower=True) + elif args['--repost']: + slug = slugify("re: %s" % (args['--repost']), only_ascii=True, lower=True) + else: + d = iso8601.parse_date(args['--date']) + t = calendar.timegm(d.timetuple()) + slug = singular.SingularHandler.baseN(t) + args['--slug'] = input('Slug [%s]:' % (slug)) or slug + + if args['--slug'] in slugs: + logging.warning("This slug already exists: %s", args['--slug']) + slugbase = args['--slug'] + inc = 1 + while args['--slug'] in slugs: + args['--slug'] = "%s-%d" % (slugbase, inc) + inc = inc+1 + logging.warning("Using %s as slug", args['--slug']) + + if not args['--summary']: + args['--summary'] = input('Summary []:') or '' + + if not args['--content']: + args['--content'] = input('Content []:') or '' + + if not args['--redirect']: + args['--reditect'] = input('Additional slugs (separated by ;, no whitespace) []:') or [] + + p = ParseCMDLine(args) \ No newline at end of file diff --git a/receiver.py b/receiver.py new file mode 100644 index 0000000..822eb21 --- /dev/null +++ b/receiver.py @@ -0,0 +1,850 @@ +import glob +import asyncio +import uvloop +import os +from sanic import Sanic +import sanic.response +from sanic.log import log as logging +from whoosh import index, qparser +import pynmea2 +import datetime +import pytz +import re +import validators +import requests +import pypandoc +import hashlib +import time +from webmentiontools import urlinfo +import json +import calendar +import mimetypes +import singular +import urllib.parse +from ruamel import yaml +from slugify import slugify +import smtplib +import iso8601 +import csv +import shutil +import collections +from git import Repo, Actor +import frontmatter +#import gzip +import arrow + +class ToEmail(object): + def __init__(self, webmention): + self.webmention = webmention + self.set_html() + self.set_headers() + + + def set_html(self): + for authormeta in ['email', 'name', 'url']: + if not authormeta in self.webmention['author']: + self.webmention['author'][authormeta] = '' + + html = """ + + + +

+<h1>New %s</h1>
+<dl>
+<dt>From</dt>
+<dd>
+<a href="%s">%s</a>
+<a href="mailto:%s">%s</a>
+</dd>
+<dt>Source</dt>
+<dd><a href="%s">%s</a></dd>
+<dt>Target</dt>
+<dd><a href="%s">%s</a></dd>
+</dl>
+ %s + + """ % ( + self.webmention['type'], + self.webmention['author']['url'], + self.webmention['author']['name'], + self.webmention['author']['email'], + self.webmention['author']['email'], + self.webmention['source'], + self.webmention['source'], + self.webmention['target'], + self.webmention['target'], + pypandoc.convert_text( + self.webmention['content'], + to='html5', + format="markdown+" + "+".join([ + 'backtick_code_blocks', + 'auto_identifiers', + 'fenced_code_attributes', + 'definition_lists', + 'grid_tables', + 'pipe_tables', + 'strikeout', + 'superscript', + 'subscript', + 'markdown_in_html_blocks', + 'shortcut_reference_links', + 'autolink_bare_uris', + 'raw_html', + 'link_attributes', + 'header_attributes', + 'footnotes', + ]) + ) + ) + self.html = html + + def set_headers(self): + """ Create and send email from a parsed webmention """ + + self.headers = { + 'Content-Type': 'text/html; charset=utf-8', + 'Content-Disposition': 'inline', + 'Content-Transfer-Encoding': '8bit', + 'Date': self.webmention['date'].strftime('%a, %d %b %Y %H:%M:%S %Z'), + 'X-WEBMENTION-SOURCE': self.webmention['source'], + 'X-WEBMENTION-TARGET': self.webmention['target'], + 'From': glob.conf['from']['address'], + 'To': glob.conf['to']['address'], + 'Subject': "[webmention] from %s to %s" % ( self.webmention['source'], self.webmention['target'] ), + } + + + def send(self): + msg = '' + for key, value in self.headers.items(): + msg += "%s: %s\n" % ( key, value ) + + msg += "\n%s\n" % self.html + + try: + s = smtplib.SMTP( glob.conf['smtp']['host'], glob.conf['smtp']['port'] ) + if glob.conf['smtp']['tls']: + s.ehlo() + s.starttls() + s.ehlo() + + if glob.conf['smtp']['username'] and glob.conf['smtp']['password']: + s.login(glob.conf['smtp']['username'], glob.conf['smtp']['password']) + + s.sendmail( self.headers['From'], [ self.headers['To'] ], msg.encode("utf8") ) + s.quit() + except: + print("Unexpected error:", sys.exc_info()[0]) + raise + + +class MicropubHandler(object): + def __init__(self, request): + self.request = request + self.response = sanic.response.text("Unhandled error", status=500) + + self.slug = '' + self.content = '' + self.category = 'note' + self.meta = {} + self.dt = datetime.datetime.now().replace(tzinfo=pytz.utc) + + logging.debug("incoming micropub request:") + logging.debug(self.request.body) + + logging.debug("** args:") + logging.debug(self.request.args) + + logging.debug("** query string:") + logging.debug(self.request.query_string) + + logging.debug("** headers:") + logging.debug(self.request.headers) + + with open(os.path.join(glob.CACHE, "tags.json"), "r") as db: + self.existing_tags = json.loads(db.read()) + db.close() + + self._parse() + + def _verify(self): + if 'q' in self.request.args: + if 'config' in self.request.args['q']: + self.response = sanic.response.json({ + 'tags': self.existing_tags + }, status=200) + return + if 'syndicate-to' in self.request.args['q']: + self.response = sanic.response.json({ + 'syndicate-to': [] + }, status=200) + return + + if not 'access_token' in self.request.form: + self.response = sanic.response.text("Mising access token", status=401) + return + + token = self.request.form.get('access_token') + + verify = requests.get( + 'https://tokens.indieauth.com/token', + allow_redirects=False, + timeout=10, + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Authorization': 'Bearer %s' % (token) + }); + + if verify.status_code != requests.codes.ok: + self.response = sanic.response.text("Could not verify access token", 
status=500) + return False + + response = urllib.parse.parse_qs(verify.text) + logging.debug(response) + if 'scope' not in response or 'me' not in response: + self.response = sanic.response.text("Could not verify access token", status=401) + return False + + if '%s/' % (glob.conf['site']['url'].rstrip()) not in response['me']: + self.response = sanic.response.text("You can't post to this domain.", status=401) + return False + + if 'post' not in response['scope'] and 'create' not in response['scope']: + self.response = sanic.response.text("Invalid scope", status=401) + return False + + return True + + def _parse(self): + if not self._verify(): + return + + if len(self.request.files): + self.response = sanic.response.text("File handling is not yet done", status=501) + return + #for ffield in self.request.files.keys(): + #logging.info("got file field: %s" % ffield) + #f = self.request.files.get(ffield) + #logging.info("mime is: %s" % f.type) + #logging.info("ext should be: %s" % mimetypes.guess_extension(f.type)) + + ##f.body + ##f.type + ##logging.info( f ) + + self.meta['published'] = self.dt.strftime('%Y-%m-%dT%H:%M:%S%z') + + slug = None + + if 'content' in self.request.form and len(self.request.form.get('content')): + self.content = self.request.form.get('content') + + if 'summary' in self.request.form and len(self.request.form.get('summary')): + self.meta['summary'] = self.request.form.get('summary') + + if 'slug' in self.request.form and len(self.request.form.get('slug')): + slug = self.request.form.get('slug') + + if 'name' in self.request.form and len(self.request.form.get('name')): + self.meta['title'] = self.request.form.get('name') + if not slug: + slug = self.meta['title'] + + if 'in-reply-to' in self.request.form and len(self.request.form.get('in-reply-to')): + self.meta['in-reply-to'] = self.request.form.get('in-reply-to') + if not slug: + slug = 're: %s', self.meta['in-reply-to'] + + if 'repost-of' in self.request.form and len(self.request.form.get('repost-of')): + self.meta['repost-of'] = self.request.form.get('repost-of') + category = 'bookmark' + if not slug: + slug = '%s', self.meta['repost-of'] + + if 'bookmark-of' in self.request.form and len(self.request.form.get('bookmark-of')): + self.meta['bookmark-of'] = self.request.form.get('bookmark-of') + self.category = 'bookmark' + if not slug: + slug = '%s', self.meta['bookmark-of'] + + if 'category[]' in self.request.form: + self.meta['tags'] = list(self.request.form['category[]']) + if 'summary' in self.meta and ('IT' in self.meta['tags'] or 'it' in self.meta['tags']): + self.category = 'article' + elif 'summary' in self.meta and ('journal' in self.meta['tags'] or 'journal' in self.meta['tags']): + self.category = 'journal' + + if not slug: + slug = singular.SingularHandler.baseN(calendar.timegm(self.dt.timetuple())) + + self.slug = slugify(slug, only_ascii=True, lower=True) + self._write() + + def _write(self): + fpath = os.path.join(glob.CONTENT, self.category, '%s.md' % (self.slug)) + if os.path.isfile(fpath): + self.response = sanic.response.text("Update handling is not yet done", status=501) + return + + logfile = os.path.join(glob.LOGDIR, "micropub-%s.log" % (self.dt.strftime("%Y-%m"))) + with open (logfile, 'a') as micropublog: + logging.debug("logging micropub request") + micropublog.write("%s %s\n" % (self.dt.strftime('%Y-%m-%dT%H:%M:%S%z'), fpath)) + micropublog.close() + + with open (fpath, 'w') as mpf: + logging.info("writing file to: %s", fpath) + out = "---\n" + yaml.dump(self.meta, 
Dumper=yaml.RoundTripDumper, allow_unicode=True, indent=4) + "---\n\n" + self.content + mpf.write(out) + mpf.close() + + self._git(fpath) + + logging.info("trying to open and parse the received post") + post = singular.ArticleHandler(fpath, category=self.category) + post.write() + post.pings() + + self.response = sanic.response.text( + "Post created", + status = 201, + headers = { + 'Location': "%s/%s/" % (glob.conf['site']['url'], self.slug) + } + ) + + return + + def _git(self, fpath): + logging.info("committing to git") + repo = Repo(glob.CONTENT) + author = Actor(glob.conf['author']['name'], glob.conf['author']['email']) + index = repo.index + newfile = fpath.replace(glob.CONTENT, '').lstrip('/') + index.add([newfile]) + message = 'new content via micropub: %s' % (newfile) + index.commit(message, author=author, committer=author) + + +class SearchHandler(object): + def __init__ (self, query): + self.query = query + self.response = sanic.response.text("You seem to have forgot to enter what you want to search for. Please try again.", status=400) + + if not query: + return + + self._tmpl = glob.jinja2env.get_template('searchresults.html') + self._ix = index.open_dir(glob.SEARCHDB) + self._parse() + + def _parse(self): + self.query = self.query.replace('+', ' AND ') + self.query = self.query.replace(' -', ' NOT ') + qp = qparser.MultifieldParser( + ["title", "content", "tags"], + schema = glob.schema + ) + q = qp.parse(self.query) + r = self._ix.searcher().search(q, sortedby="weight", limit=100) + logging.info("results for '%s': %i", self.query, len(r)) + results = [] + for result in r: + res = { + 'title': result['title'], + 'url': result['url'], + 'highlight': result.highlights("content"), + } + + if 'img' in result: + res['img'] = result['img'] + + results.append(res) + + tvars = { + 'term': self.query, + 'site': glob.conf['site'], + 'posts': results, + 'taxonomy': {} + } + logging.info("collected %i results to render", len(results)) + html = self._tmpl.render(tvars) + self.response = sanic.response.html(html, status=200) + + +class WebmentionHandler(object): + def __init__ ( self, source, target ): + self.source = source + self.target = target + self.time = arrow.utcnow().timestamp + logging.debug("validating: from: %s; to: %s" % (self.source, self.target) ) + self.response = sanic.response.json({ + 'status': 'ok','msg': 'accepted', + }, 200) + self._validate() + self._parse() + self._archive() + self._send() + + def _validate(self): + if not validators.url(self.source): + self.response = sanic.response.json({ + 'status': 'error','msg': '"souce" parameter is an invalid URL', + }, 400) + return + + if not validators.url(self.target): + self.response = sanic.response.json({ + 'status': 'error','msg': '"target" parameter is an invalid URL', + }, 400) + return + + _target = urllib.parse.urlparse(self.target) + _target_domain = '{uri.netloc}'.format(uri=_target) + + if not _target_domain in glob.conf['accept_domains']: + self.response = sanic.response.json({ + 'status': 'error', + 'msg': "%s' is not in the list of allowed domains" % ( + _target_domain + ) + }, 400) + return + + _source = urllib.parse.urlparse(self.source) + _source_domain = '{uri.netloc}'.format(uri=_source) + + if _source_domain == _target_domain and not glob.conf['allow_selfmention']: + self.response = sanic.response.json({ + 'status': 'error', + 'msg': "selfpings are disabled" + }, 400) + return + + return + + def _parse(self): + if self.response.status != 200: + return + + self._log() + self._source = 
urlinfo.UrlInfo(self.source) + if self._source.error: + logging.warning( "couldn't fetch %s; dropping webmention" % (self.source)) + return + self.source = self._source.realurl + if not self._source.linksTo(self.target): + logging.warning( "%s is not linking to %s; dropping webmention" % (self.source, self.target)) + return + + self._target = urlinfo.UrlInfo(self.target) + if self._target.error: + logging.warning( "couldn't fetch %s; dropping webmention" % (self.target)) + return + self.target = self._target.realurl + + self.webmention = { + 'author': self._source.author(), + 'type': self._source.relationType(), + 'target': self.target, + 'source': self.source, + 'date': arrow.get(self._source.pubDate()), + 'content': pypandoc.convert_text( + self._source.content(), + to="markdown-" + "-".join([ + 'raw_html', + 'native_divs', + 'native_spans', + ]), + format='html' + ) + } + + + def _send(self): + if self.response.status != 200: + return + + m = ToEmail(self.webmention) + m.send() + + + def _archive(self): + if self.response.status != 200: + return + + fbase = self.webmention['date'].format('YYYY-MM-DD-HH-mm-ss') + fpath = self._archive_name(fbase) + + archive = dict(self.webmention) + archive['date'] = archive['date'].format('YYYY-MM-DDTHH.mm.ssZ') + content = archive['content'] + del(archive['content']) + + with open (fpath, 'w') as f: + logging.info("writing file to: %s", fpath) + out = "---\n" + yaml.dump( + archive, + Dumper=yaml.RoundTripDumper, + allow_unicode=True, + indent=4 + ) + "---\n\n" + content + f.write(out) + f.close() + + def _verify_archive(self, p): + archive = frontmatter.load(p) + + if 'target' not in archive.metadata: + logging.warning('missing target') + return False + + if 'source' not in archive.metadata: + logging.warning('missing source') + return False + + if 'date' not in archive.metadata: + logging.warning('missing date') + return False + + if archive.metadata['target'] != self.webmention['target']: + logging.warning('target different') + return False + + if archive.metadata['source'] != self.webmention['source']: + logging.warning('source different') + return False + + d = arrow.get(archive.metadata['date']) + + if d.timestamp != self.webmention['date'].timestamp: + logging.warning('date different') + return False + + # overwrite + return True + + def _archive_name(self, archive, ext='.md'): + p = os.path.join(glob.COMMENTS, "%s%s" % (archive, ext)) + + if not os.path.exists(p): + logging.debug("%s doesn't exits yet" % p) + return p + + logging.debug("%s exists, checking for update" % p) + if self._verify_archive(p): + return p + + # another comment with the exact same second? wy not. 
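+ # A worked example (hypothetical filenames) of the suffix logic below:
+ # with archive = "2017-05-23-11-10-30" and both "2017-05-23-11-10-30.md"
+ # and "2017-05-23-11-10-30.1.md" already on disk, suffixes becomes
+ # ['', '1'] and indexes becomes [1], so idx ends up as 2 and the new
+ # archive is written to "2017-05-23-11-10-30.2.md".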
+ names = [x for x in os.listdir(glob.COMMENTS) if x.startswith(archive)] + suffixes = [x.replace(archive, '').replace(ext, '').replace('.','') for x in names] + indexes = [int(x) for x in suffixes if x and set(x) <= set('0123456789')] + idx = 1 + if indexes: + idx += sorted(indexes)[-1] + + return os.path.join(glob.COMMENTS, "%s.%d%s" % (archive, idx, ext)) + + def _log(self): + if not os.path.isdir(glob.LOGDIR): + os.mkdir (glob.LOGDIR) + + logfile = os.path.join(glob.LOGDIR, datetime.datetime.now().strftime("%Y-%m")) + s = json.dumps({ + 'time': self.time, + 'source': self.source, + 'target': self.target + }) + + with open(logfile, "a") as log: + logging.debug( "writing logfile %s with %s" % (logfile, s)) + log.write("%s\n" % (s)) + log.close() + + +class TimeSeriesHandler(object): + def __init__(self, tag): + if not os.path.isdir(glob.TSDBDIR): + os.mkdir(glob.TSDBDIR) + + self.tag = tag + self.p = os.path.join(glob.TSDBDIR, '%s.csv' % (self.tag)) + self.db = {} + + #def _loaddb(self): + #if not os.path.isfile(self.p): + #return + + #pattern = re.compile(r'^([0-9-\+:T]+)\s+(.*)$') + #searchfile = open(self.p, 'r') + #for line in searchfile: + #matched = re.match(pattern, line) + #if not matched: + #continue + + #epoch = int(iso8601.parse_date(matched.group(1)).replace(tzinfo=pytz.utc).strftime('%s')) + #data = matched.group(2) + #self.db[epoch] = data + #searchfile.close() + + #def _dumpdb(self): + #lines = [] + #for e in self.db.items(): + #epoch, data = e + #tstamp = datetime.datetime.utcfromtimestamp(epoch).replace(tzinfo=pytz.utc).strftime(glob.ISODATE) + #line = '%s %s' % (tstamp, data) + #lines.append(line) + + #bkp = '%s.bkp' % (self.p) + #shutil.copy(self.p, bkp) + #with open(self.p, "w") as searchfile: + + #searchfile.write() + #del(cr) + #csvfile.close() + #os.unlink(bkp) + + @staticmethod + def _common_date_base(d1, d2): + d1 = d1.replace(tzinfo=pytz.utc).strftime(glob.ISODATE) + d2 = d2.replace(tzinfo=pytz.utc).strftime(glob.ISODATE) + l = len(d1) + common = '' + for i in range(l): + if d1[i] == d2[i]: + common = common + d1[i] + else: + break + return common + + def search(self, when, tolerance=1800): + when = when.replace(tzinfo=pytz.utc) + tolerance = int(tolerance/2) + minwhen = when - datetime.timedelta(seconds=tolerance) + maxwhen = when + datetime.timedelta(seconds=tolerance) + + closest = None + mindiff = float('inf') + common = TimeSeriesHandler._common_date_base(minwhen, maxwhen) + pattern = re.compile(r'^(%s[0-9-\+:T]+)\s+(.*)$' % (common)) + searchfile = open(self.p, 'r') + for line in searchfile: + matched = re.match(pattern, line) + if not matched: + continue + + d = iso8601.parse_date(matched.group(1)) + diff = d - when + diff = abs(diff.total_seconds()) + if diff >= mindiff: + continue + + mindiff = diff + closest = (d, matched.group(2)) + searchfile.close() + return closest + + def append(self, data, dt=datetime.datetime.now().replace(tzinfo=pytz.utc)): + if os.path.isfile(self.p): + epoch = int(dt.strftime('%s')) + stat = os.stat(self.p) + if epoch < stat.st_mtime: + logging.warning('Refusing to append %s with old data' % self.p) + return + + with open(self.p, 'a') as db: + db.write("%s %s\n" % ( + dt.strftime(glob.ISODATE), + data + )) + + +class DataHandler(object): + def __init__(self, request): + self.request = request + self.dt = datetime.datetime.now().replace(tzinfo=pytz.utc) + self.response = sanic.response.text('accepted',status=200) + + if not 'secrets' in glob.conf or \ + not 'devices' in glob.conf['secrets']: + self.response = 
sanic.response.text( + 'server configuration error', + status=501 + ) + return + + if 'id' not in self.request.args: + self.response = sanic.response.text( + 'device id not found in request', + status=401 + ) + return + + id = self.request.args.get('id') + if id not in glob.conf['secrets']['devices'].keys(): + self.response = sanic.response.text( + 'device id rejected', + status=401 + ) + return + + self.id = glob.conf['secrets']['devices'][id] + +class OpenGTSHandler(DataHandler): + def __init__(self, *args, **kwargs): + super(OpenGTSHandler, self).__init__(*args, **kwargs) + self.lat = 0 + self.lon = 0 + self.alt = 0 + self._parse() + self.l = '%s 0' % (self.dt.strftime(glob.ISODATE)) + + def _parse(self): + logging.debug('--- incoming location request ---') + logging.debug(self.request.args) + + if 'latitude' in self.request.args and 'longitude' in self.request.args: + self.lat = float(self.request.args.get('latitude')) + self.lon = float(self.request.args.get('longitude')) + elif 'gprmc' in self.request.args: + gprmc = pynmea2.parse(self.request.args.get('gprmc')) + try: + self.lat = float(gprmc.latitude) + self.lon = float(gprmc.longitude) + except: + self.response = sanic.response.text( + "could not process gprmc string", + status=422 + ) + return + else: + self.response = sanic.response.text( + "no location information found in query", + status=401 + ) + return + + if 'exclude_coordinates' in glob.conf['secrets']: + excl = {} + for t in ['lat', 'lon']: + excl[t] = [] + if t in glob.conf['secrets']['exclude_coordinates']: + for c in glob.conf['secrets']['exclude_coordinates'][t]: + excl[t].append(float(c)) + + if round(self.lat,2) in excl['lat'] and round(self.lon,2) in excl['lon']: + self.response = sanic.response.text( + "this location is on the excluded list", + status=200 + ) + return + + if 'loc_timestamp' in self.request.args and 'offset' in self.request.args: + # this is a bit ugly: first convert the epoch to datetime + # then append it with the offset as string + # and convert the string back to datetime from the iso8601 string + dt = datetime.datetime.utcfromtimestamp(int(self.request.args.get('loc_timestamp'))) + dt = dt.strftime('%Y-%m-%dT%H:%M:%S') + dt = "%s%s" % (dt, self.request.args.get('offset')) + try: + self.dt = iso8601.parse_date(dt).replace(tzinfo=pytz.utc) + except: + pass + + if 'altitude' in self.request.args: + self.alt = float(self.request.args.get('altitude')) + else: + try: + self.alt = OpenGTSHandler.altitude_from_bing(self.lat, self.lon) + except: + pass + + self.lat = "{:4.6f}".format(float(self.lat)) + self.lon = "{:4.6f}".format(float(self.lon)) + self.alt = "{:4.6f}".format(float(self.alt)) + l = '%s %s %s' % (self.lat, self.lon, self.alt) + + gpsfile = TimeSeriesHandler('location') + gpsfile.append(l, dt=self.dt) + + @staticmethod + def altitude_from_bing(lat, lon): + if 'bing_key' not in glob.conf['secrets']: + return 0 + if not glob.conf['secrets']['bing_key']: + return 0 + + url = "http://dev.virtualearth.net/REST/v1/Elevation/List?points=%s,%s&key=%s" % ( + lat, + lon, + glob.conf['secrets']['bing_key'] + ) + + bing = requests.get(url) + bing = json.loads(bing.text) + if 'resourceSets' not in bing or \ + 'resources' not in bing['resourceSets'][0] or \ + 'elevations' not in bing['resourceSets'][0]['resources'][0] or \ + not bing['resourceSets'][0]['resources'][0]['elevations']: + return 0 + + alt = float(bing['resourceSets'][0]['resources'][0]['elevations'][0]) + del(bing) + del(url) + return alt + + +class SensorHandler(DataHandler): + def 
__init__(self, *args, **kwargs): + super(SensorHandler, self).__init__(*args, **kwargs) + self.data = 0 + self.tag = '' + self._parse() + + def _parse(self): + logging.debug('--- incoming sensor request ---') + logging.debug(self.request.args) + + for tag in self.request.args: + if tag == 'id': + continue + + datafile = TimeSeriesHandler('%s-%s' % (self.id, tag)) + datafile.append(self.request.args.get(tag), dt=self.dt) + + +asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) +app = Sanic() + +@app.route("/webmention") +async def wm(request, methods=["POST"]): + source = request.form.get('source') + target = request.form.get('target') + r = WebmentionHandler(source, target) + return r.response + +@app.route("/search") +async def search(request, methods=["GET"]): + query = request.args.get('s') + r = SearchHandler(query) + return r.response + +@app.route("/micropub") +async def mpub(request, methods=["POST","GET"]): + r = MicropubHandler(request) + return r.response + +@app.route("/opengts") +async def opengts(request, methods=["GET"]): + r = OpenGTSHandler(request) + return r.response + +@app.route("/sensor") +async def sensor(request, methods=["GET"]): + r = SensorHandler(request) + return r.response + +if __name__ == "__main__": + app.run(host="127.0.0.1", port=8000, debug=True) \ No newline at end of file diff --git a/singular.py b/singular.py new file mode 100644 index 0000000..9277f37 --- /dev/null +++ b/singular.py @@ -0,0 +1,916 @@ +import os +import re +import sys +import collections +import logging +import glob +import img +import pypandoc +import langdetect +from cache import Cached +from slugify import slugify +from ruamel import yaml +from bs4 import BeautifulSoup +import frontmatter +from webmentiondb import WebmentionDB +import arrow +import json +import socket +import requests +import hashlib +import shutil + + +class SingularHandler(object): + + def __init__(self, fpath, pingdb=WebmentionDB(), category='note'): + self.fpath = os.path.abspath(fpath) + path, fname = os.path.split(self.fpath) + fname, ext = os.path.splitext(fname) + self.fname = fname + self.fext = ext + self.ftime = os.stat(self.fpath) + self.target = os.path.join(glob.TARGET, "%s.html" % (self.fname)) + + basedir = os.path.join(glob.TARGET, "%s" % (self.fname)) + if not os.path.isdir(basedir): + os.mkdir(basedir) + + self.saved = os.path.join(glob.TARGET, "%s" % (self.fname), "saved.html") + + self.pingdb = pingdb + self.title = '' + self.content = '' + self._content = '' + self.summary = '' + self.html = '' + self.sumhtml = '' + self.category = category + self.tags = [] + self.reactions = {} + #self.date = datetime.datetime(1970, 1, 1).replace(tzinfo=pytz.utc) + self.date = arrow.get(0) + self.updated = None + self.dtime = 0 + self.utime = 0 + self.redirect = {} + + self.exifmin = {} + self.lang = glob.conf['site']['lang'] + self.syndicate = {} + self.syndications = [] + self.template = 'singular.html' + + self.slug = slugify(self.fname, only_ascii=True, lower=True) + self.shortslug = slugify(self.fname, only_ascii=True, lower=True) + self.img = None + self.srcset = '' + + def __repr__(self): + return "Post '%s' (%s), category: %s" % (self.title,self.fname,self.category) + + + def _postsetup(self): + """ Shared post-setup - the initial thing, such at title, should be + set by the classes inheriting this one; these are only the common, + shared variables """ + + # set published epoch + #self.dtime = calendar.timegm(self.date.timetuple()) + self.dtime = self.date.timestamp + + # set updated epoch, if any 
and set the original file date according + # to either the updated or the published time + if self.updated: + #self.utime = calendar.timegm(self.updated.timetuple()) + self.utime = self.updated.timestamp + if self.utime > 0 and self.utime != self.ftime.st_mtime: + os.utime(self.fpath, (self.utime, self.utime)) + elif self.dtime > 0 and self.dtime != self.ftime.st_mtime: + os.utime(self.fpath, (self.dtime, self.dtime)) + + # generate shortslug from dtime if possible + if self.dtime > 0: + self.shortslug = SingularHandler.baseN(self.dtime) + self.redirect[self.shortslug] = 1 + + # detect post content language if possible + try: + self.lang = langdetect.detect("%s\n\n%s" % (self.title, self.content)) + except: + pass + + # make HTML from markdown via pandoc for the content and the summary + self.html = SingularHandler.pandoc_md2html( + self.content, + time=self.ftime + ) + self.sumhtml = SingularHandler.pandoc_md2html( + self.summary, + time=self.ftime + ) + + self.url = "%s/%s" % (glob.conf['site']['url'], self.slug) + self.syndications = self.pingdb.posses(self.url) + + #def urlsvg(self): + # import pyqrcode + # import tempfile + ## generate qr code to the url + #qrname = tempfile.NamedTemporaryFile(prefix='pyqr_') + #qr = pyqrcode.create(self.url, error='L') + #qr.svg( + #qrname.name, + #xmldecl=False, + #omithw=True, + #scale=1, + #quiet_zone=0, + #svgclass='qr', + #lineclass='qrline' + #) + #with open(qrname.name) as f: + #qrsvg = f.read() + #f.close() + #return qrsvg + + @staticmethod + def pandoc_md2html(t, time=None): + if len(t) == 0: + return t + + cached = Cached(text="%s" % t, stime=time) + c = cached.get() + + if c: + return c + else: + extras = [ + 'backtick_code_blocks', + 'auto_identifiers', + 'fenced_code_attributes', + 'definition_lists', + 'grid_tables', + 'pipe_tables', + 'strikeout', + 'superscript', + 'subscript', + 'markdown_in_html_blocks', + 'shortcut_reference_links', + 'autolink_bare_uris', + 'raw_html', + 'link_attributes', + 'header_attributes', + 'footnotes', + ] + md = "markdown+" + "+".join(extras) + + t = pypandoc.convert_text(t, to='html5', format=md) + cached.set(t) + return t + + @staticmethod + def pandoc_html2md(t, time=None): + if len(t) == 0: + return t + + cached = Cached(text="%s" % t, stime=time) + c = cached.get() + + if c: + return c + else: + t = pypandoc.convert_text( + t, + to="markdown-" + "-".join([ + 'raw_html', + 'native_divs', + 'native_spans', + ]), + format='html' + ) + + cached.set(t) + return t + + + def tmpl(self): + return { + 'title': self.title, + 'published': self.date, + 'tags': self.tags, + 'author': glob.conf['author'], + 'content': self.content, + 'html': self.html, + 'category': self.category, + 'reactions': self.reactions, + 'updated': self.updated, + 'summary': self.sumhtml, + 'exif': self.exifmin, + 'lang': self.lang, + 'syndicate': self.syndicate, + 'slug': self.slug, + 'shortslug': self.shortslug, + 'srcset': self.srcset, + } + + @staticmethod + def write_redirect(sslug, target, tstamp=arrow.utcnow().timestamp): + + tmpl = glob.jinja2env.get_template('redirect.html') + jvars = { + 'url': target + } + r = tmpl.render(jvars) + # this is to support / ending urls even for the redirects + dirs = [ + os.path.join(glob.TARGET, sslug) + ] + + for d in dirs: + if not os.path.exists(d): + os.mkdir(d) + + files = [ + os.path.join(glob.TARGET, "%s.html" % (sslug)), + os.path.join(glob.TARGET, sslug, "index.html") + ] + for f in files: + if os.path.isfile(f): + rtime = os.stat(f) + if tstamp == rtime.st_mtime: + logging.debug( + 
"Unchanged dates on redirect file %s", f + ) + continue + + with open(f, "w") as html: + logging.info("writing redirect file %s", f) + html.write(r) + html.close() + os.utime(f, (tstamp,tstamp)) + + + def redirects(self): + """ Write redirect HTMLs """ + + if self.category == 'page': + return + + for sslug in self.redirect.keys(): + SingularHandler.write_redirect(sslug, self.url, self.ftime.st_mtime) + + def write(self): + """ Write HTML file """ + + if os.path.isfile(self.target): + ttime = os.stat(self.target) + if self.ftime.st_mtime == ttime.st_mtime and not glob.FORCEWRITE: + logging.debug( + "Unchanged dates on %s; skipping rendering and writing", + self.fname + ) + return + + tmpl = glob.jinja2env.get_template(self.template) + logging.info("rendering %s", self.fname) + tmplvars = { + 'post': self.tmpl(), + 'site': glob.conf['site'], + 'taxonomy': {}, + } + r = tmpl.render(tmplvars) + soup = BeautifulSoup(r,"html5lib") + r = soup.prettify() + + targets = [self.target] + for target in targets: + with open(target, "w") as html: + logging.info("writing %s", target) + html.write(r) + html.close() + os.utime(target, (self.ftime.st_mtime, self.ftime.st_mtime)) + + rdir = os.path.join(glob.TARGET, self.slug) + if not os.path.isdir(rdir): + os.mkdir(rdir) + + altdst = os.path.join(glob.TARGET, self.slug, 'index.html') + altsrc = os.path.join('..', self.target) + + if not os.path.islink(altdst): + if os.path.isfile(altdst): + os.unlink(altdst) + os.symlink(altsrc, altdst) + + #links = [] + #for r in self.reactions.items(): + #reactiontype, urls = r + #if isinstance(urls, str): + #links.append(urls) + #elif isinstance(urls, list): + #links = [*links, *urls] + + #if 1 == len(links): + #saved = os.path.join(glob.TARGET, self.slug, 'saved.html') + #if not os.path.isfile(saved): + #h, p = _localcopy_hashpath(links[0]) + #c = self._get_localcopy(links[0], h, p) + #with open(saved, 'w') as f: + #f.write(c) + #f.close() + + def index(self, ix): + """ Write search index """ + + writer = ix.writer() + + c = "%s %s %s %s %s" % ( + self.slug, + self.summary, + self._content, + yaml.dump(self.reactions, Dumper=yaml.RoundTripDumper), + yaml.dump(self.exifmin, Dumper=yaml.RoundTripDumper) + ) + + c = "%s %s" % (c, self._localcopy_include()) + + if self.img: + imgstr = self.img.mksrcset(generate_caption=False) + else: + imgstr = '' + + writer.add_document( + title=self.title, + url=self.url, + content=c, + date=self.date.datetime, + tags=",".join(self.tags), + weight=1, + img=imgstr + ) + writer.commit() + + + def pings(self): + """ Ping (webmention) all URLs found in the post """ + + links = [] + urlregex = re.compile( + r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+' + r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*' + ) + matches = re.findall(urlregex, self.content) + + for r in self.reactions.items(): + reactiontype, urls = r + if isinstance(urls, str): + matches.append(urls) + elif isinstance(urls, list): + matches = [*matches, *urls] + + #for s in self.syndicate.keys(): + #matches.append('https://brid.gy/publish/%s' % (s)) + + if self.utime and self.utime > 0: + time = self.utime + else: + time = self.dtime + + if len(matches) > 0: + for link in matches: + if glob.conf['site']['domain'] in link: + continue + + if link in links: + continue + + #self._localcopy(link) + self.pingdb.ping(self.url, link, time) + links.append(link) + + + def _localcopy_hashpath(self,url): + h = hashlib.md5(url.encode('utf-8')).hexdigest() + p = os.path.join(glob.LOCALCOPIES, "%s.html" % (h)) + return (h, p) + + + def _localcopy_include(self): 
+        links = []
+        md = ''
+        for r in self.reactions.items():
+            reactiontype, urls = r
+            if isinstance(urls, str):
+                links.append(urls)
+            elif isinstance(urls, list):
+                links = [*links, *urls]
+
+        for url in links:
+            h, p = self._localcopy_hashpath(url)
+            html = self._get_localcopy(url, h, p)
+            md = "%s %s" % (
+                md,
+                SingularHandler.pandoc_html2md(html, os.stat(p))
+            )
+
+        return md
+
+
+    def _get_localcopy(self, url, h, p):
+        html = ''
+
+        if os.path.isfile(p):
+            with open(p, 'r') as f:
+                html = f.read()
+                f.close()
+        else:
+            html = self._make_localcopy(url, h, p)
+
+        return html
+
+
+    def _make_localcopy(self, url, h, p):
+        post = self._pull_localcopy(url)
+        tmpl = glob.jinja2env.get_template('localcopy.html')
+        html = tmpl.render({'post': post})
+        soup = BeautifulSoup(html, "html5lib")
+        html = soup.prettify()
+
+        with open(p, "w") as f:
+            logging.info("saving readable copy of %s to %s", url, p)
+            f.write(html)
+            f.close()
+
+        return html
+
+
+    def _pull_localcopy(self, url):
+
+        # find the true URL
+        # MAYBE: add fallback to archive.org?
+        realurl = url
+        try:
+            pretest = requests.head(url, allow_redirects=True, timeout=30)
+            realurl = pretest.url
+        except:
+            pass
+
+        parsed = {
+            'lang': 'en',
+            'url': url,
+            'realurl': realurl,
+            'html': '',
+            'title': '',
+            'excerpt': '',
+            'byline': '',
+        }
+
+        # only try the readability proxy when both host and port are configured
+        if 'readable' in glob.conf and \
+           'port' in glob.conf['readable'] and \
+           'host' in glob.conf['readable']:
+
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            socktest = sock.connect_ex((
+                glob.conf['readable']['host'], int(glob.conf['readable']['port'])
+            ))
+            if 0 == socktest:
+                text = self._localcopy_via_proxy(realurl) or {}
+                parsed['html'] = text.get('content', '')
+                parsed['title'] = text.get('title', url)
+                parsed['excerpt'] = text.get('excerpt', '')
+                parsed['byline'] = text.get('byline', '')
+
+                try:
+                    parsed['lang'] = langdetect.detect(parsed['html'])
+                except:
+                    pass
+
+                return parsed
+
+        # TODO: fallback to full-python solution if the previous failed
+        return parsed
+
+
+    def _localcopy_via_proxy(self, url):
+        r = "http://%s:%s/api/get?url=%s&sanitize=y" % (
+            glob.conf['readable']['host'],
+            glob.conf['readable']['port'],
+            url
+        )
+
+        try:
+            req = requests.get(r, allow_redirects=False, timeout=60)
+        except:
+            return None
+
+        text = {}
+        try:
+            text = json.loads(req.text)
+        except:
+            pass
+
+        return text
+
+
+    def _adaptify(self):
+        """ Generate srcset for all images possible """
+
+        linkto = False
+        isrepost = None
+
+        if len(self.reactions.keys()):
+            isrepost = list(self.reactions.keys())[0]
+
+        if isrepost:
+            if len(self.reactions[isrepost]) == 1:
+                linkto = self.reactions[isrepost][0]
+
+        mdmatch = re.compile(
+            r'!\[.*\]\(.*?\.(?:jpe?g|png|gif)'
+            r'(?:\s+[\'\"]?.*?[\'\"]?)?\)(?:\{.*?\})?'
+        )
+        mdsplit = re.compile(
+            r'!\[(.*)\]\((?:\/(?:files|cache)'
+            r'(?:\/[0-9]{4}\/[0-9]{2})?\/(.*\.(?:jpe?g|png|gif)))'
+            r'(?:\s+[\'\"]?(.*?)[\'\"]?)?\)(?:\{(.*?)\})?'
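+            # a hypothetical sample this pattern should split:
+            #   ![alt text](/files/2017/05/photo.jpg "a title"){.someclass}
+            # -> group(1)='alt text', group(2)='photo.jpg',
+            #    group(3)='a title', group(4)='.someclass'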
+        )
+        mdimg = re.findall(mdmatch, self.content)
+        for i in mdimg:
+            m = re.match(mdsplit, i)
+            if m:
+                #logging.info(m.groups())
+                imgpath = os.path.join(glob.SFILES, m.group(2))
+
+                if not os.path.isfile(imgpath):
+                    for c in glob.conf['category'].items():
+                        catn, catd = c
+                        catp = os.path.abspath(os.path.join(glob.CONTENT, catn))
+
+                        if not os.path.exists(catp) \
+                        or not 'type' in catd \
+                        or catd['type'] != 'photo':
+                            continue
+
+                        imgpath = os.path.join(catp, m.group(2))
+                        break
+
+                if os.path.isfile(imgpath):
+
+                    t = ''
+                    if m.group(3):
+                        t = m.group(3)
+
+                    cl = ''
+                    if m.group(4):
+                        cl = m.group(4)
+
+                    a = ''
+                    if m.group(1):
+                        a = m.group(1)
+
+                    im = img.ImageHandler(
+                        imgpath,
+                        alttext=a,
+                        title=t,
+                        imgcl=cl,
+                        linkto=linkto
+                    )
+
+                    im.downsize()
+                    logging.debug("replacing image %s with srcset", imgpath)
+                    srcset = im.mksrcset()
+                    if srcset:
+                        self.content = self.content.replace(i, srcset)
+                    del im
+                else:
+                    logging.error("missing image %s referenced in %s", m.group(2), self.fpath)
+
+    def _video(self):
+        """ [video] shortcode extractor """
+
+        match = re.compile(r'\[video mp4=\"/(?:files|cache).*?\"\]\[/video\]')
+        split = re.compile(r'\[video mp4=\"(/(?:files|cache)\/(.*?))\"\]\[/video\]')
+        videos = re.findall(match, self.content)
+        for vid in videos:
+            v = re.match(split, vid)
+            # minimal HTML5 video tag for the extracted mp4 path
+            video = """<video controls>
+    <source src="%s" type="video/mp4">
+</video>""" % (v.group(1))
+            self.content = self.content.replace(vid, video)
+
+    #def _files(self):
+        #""" Copy misc files referenced """
+
+        #match = re.compile(
+            #r'\s(?:%s)?/(?:files|cache)'
+            #r'/.*\.(?:(?!jpe?g|png|gif).*)\s' % (glob.conf['site']['domain'])
+        #)
+        #split = re.compile(
+            #r'\s(?:%s)?/((?:files|cache)'
+            #r'/(.*\.(?:(?!jpe?g|png|gif).*)))\s' % (glob.conf['site']['domain'])
+        #)
+        ##files = re.findall(match, self.content)
+        ##print(files)
+
+    def _snippets(self):
+        """ Replaces [git:(repo)/(file.ext)] with the corresponding code snippet """
+
+        snmatch = re.compile(r'\[git:[^\/]+\/(?:.*\..*)\]')
+        snsplit = re.compile(r'\[git:([^\/]+)\/((?:.*)\.(.*))\]')
+        snippets = re.findall(snmatch, self.content)
+        isconf = re.compile(r'conf', re.IGNORECASE)
+        for snippet in snippets:
+            sn = re.match(snsplit, snippet)
+            if sn:
+                fpath = os.path.join(glob.SOURCE, sn.group(1), sn.group(2))
+                if not os.path.isfile(fpath):
+                    logging.error(
+                        "missing blogsnippet in %s: %s",
+                        self.fpath,
+                        fpath
+                    )
+                    continue
+
+                if re.match(isconf, sn.group(3)):
+                    lang = 'apache'
+                else:
+                    lang = sn.group(3)
+
+                with open(fpath, "r") as snip:
+                    c = snip.read()
+                    snip.close()
+
+                c = "\n\n```%s\n%s\n```\n" % (lang, c)
+                logging.debug("replacing blogsnippet %s", fpath)
+                self.content = self.content.replace(snippet, c)
+
+    @staticmethod
+    def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
+        """ Used to create a short, lowercase slug for a number (an epoch) passed """
+        num = int(num)
+        return ((num == 0) and numerals[0]) or (
+            SingularHandler.baseN(
+                num // b,
+                b,
+                numerals
+            ).lstrip(numerals[0]) + numerals[num % b]
+        )
+
+
+
+class ArticleHandler(SingularHandler):
+
+    def __init__(self, *args, **kwargs):
+        super(ArticleHandler, self).__init__(*args, **kwargs)
+        self.dctype = 'Text'
+        self._setup()
+
+    def _setup(self):
+        post = frontmatter.load(self.fpath)
+        self.meta = post.metadata
+        self.content = post.content
+        self._content = '%s' % (self.content)
+
+        if 'tags' in post.metadata:
+            self.tags = post.metadata['tags']
+
+        if 'title' in post.metadata:
+            self.title = post.metadata['title']
+
+        if 'published' in post.metadata:
+            self.date = arrow.get(post.metadata['published'])
+
+        if 'updated' in post.metadata:
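+            # e.g. frontmatter: "updated: 2017-05-23T11:10:30+01:00"
+            # (hypothetical sample value); arrow.get() parses ISO 8601 strings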
+            self.updated = arrow.get(post.metadata['updated'])
+
+        if 'summary' in post.metadata:
+            self.summary = post.metadata['summary']
+
+        if 'redirect' in post.metadata and \
+           isinstance(post.metadata['redirect'], list):
+            for r in post.metadata['redirect']:
+                self.redirect[r] = 1
+
+        if 'syndicate' in post.metadata:
+            z = post.metadata['syndicate']
+            if isinstance(z, str):
+                self.syndicate[z] = ''
+            elif isinstance(z, dict):
+                for s, c in z.items():
+                    self.syndicate[s] = c
+            elif isinstance(z, list):
+                for s in z:
+                    self.syndicate[s] = ''
+
+        self.reactions = {}
+
+        # getting rid of '-' to avoid css trouble and similar
+        rmap = {
+            'bookmark-of': 'bookmark',
+            'repost-of': 'repost',
+            'in-reply-to': 'reply',
+        }
+
+        for x in rmap.items():
+            key, replace = x
+            if key in self.meta:
+                if isinstance(self.meta[key], str):
+                    self.reactions[replace] = [self.meta[key]]
+                elif isinstance(self.meta[key], list):
+                    self.reactions[replace] = self.meta[key]
+
+        self._adaptify()
+        self._snippets()
+        self._video()
+        #self._files()
+        super(ArticleHandler, self)._postsetup()
+
+
+class PhotoHandler(SingularHandler):
+
+    def __init__(self, *args, **kwargs):
+        super(PhotoHandler, self).__init__(*args, **kwargs)
+        self.dctype = 'Image'
+        self.img = img.ImageHandler(self.fpath)
+        self.exif = self.img.exif
+        self._setup()
+
+    def _setup(self):
+        self.syndicate = {
+            'flickr': '',
+        }
+
+        keywords = [
+            'XMP:Keywords',
+            'IPTC:Keywords'
+        ]
+        tags = {}
+        for key in keywords:
+            if key in self.exif and self.exif[key]:
+
+                if isinstance(self.exif[key], str):
+                    self.exif[key] = self.exif[key].split(",")
+
+                if isinstance(self.exif[key], list):
+                    for tag in self.exif[key]:
+                        tags[str(tag).strip()] = 1
+
+        self.tags = list(tags.keys())
+
+        # content
+        keywords = [
+            'XMP:Description',
+            'IPTC:Caption-Abstract'
+        ]
+        for key in keywords:
+            if key in self.exif and self.exif[key]:
+                self.content = self.exif[key]
+                break
+        self._content = '%s' % (self.content)
+
+        # title
+        keywords = [
+            'XMP:Title',
+            'XMP:Headline',
+            'IPTC:Headline'
+        ]
+        for key in keywords:
+            if key in self.exif and self.exif[key]:
+                self.title = self.exif[key]
+                break
+
+        # datetime
+        keywords = [
+            'XMP:DateTimeDigitized',
+            'XMP:CreateDate',
+            'EXIF:CreateDate',
+            'EXIF:ModifyDate'
+        ]
+
+        pattern = re.compile(
+            r"(?P<Y>[0-9]{4}):(?P<M>[0-9]{2}):(?P<D>[0-9]{2})\s+"
+            r"(?P<T>[0-9]{2}:[0-9]{2}:[0-9]{2})Z?"
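+            # matches exiftool-style timestamps, e.g. "2017:05:23 11:10:30"
+            # (optionally suffixed with Z); the named groups feed arrow.get() below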
+        )
+
+        for key in keywords:
+            if key not in self.exif or not self.exif[key]:
+                continue
+
+            date = None
+            matched = pattern.match(self.exif[key])
+            if not matched:
+                continue
+            v = matched.groupdict()
+
+            try:
+                date = arrow.get('%s-%s-%s %s' % (v['Y'], v['M'], v['D'], v['T']))
+            except:
+                continue
+
+            if date:
+                self.date = date
+                logging.debug("date for %s is set to %s from key %s", self.fname, self.date, key)
+                break
+
+        self.img.title = self.title
+        self.img.alttext = self.content
+        self.content = self.content + "\n\n" + self.img.mksrcset(generate_caption=False, uphoto=True)
+
+        self.img.downsize()
+        self.srcset = self.img.mksrcset(generate_caption=False, uphoto=False)
+        super(PhotoHandler, self)._postsetup()
+
+
+    def tmpl(self):
+        tmpl = super(PhotoHandler, self).tmpl()
+        tmpl['exif'] = {}
+
+        mapping = {
+            'camera': [
+                'EXIF:Model'
+            ],
+            'aperture': [
+                'EXIF:FNumber',
+                'Composite:Aperture'
+            ],
+            'shutter_speed': [
+                'EXIF:ExposureTime'
+            ],
+            'focallength': [
+                'EXIF:FocalLength',
+                'Composite:FocalLength35efl',
+            ],
+            'iso': [
+                'EXIF:ISO'
+            ],
+            'lens': [
+                'Composite:LensID',
+                'MakerNotes:Lens',
+                'Composite:LensSpec'
+            ]
+        }
+
+        for ekey, candidates in mapping.items():
+            for candidate in candidates:
+                if candidate in self.exif:
+                    tmpl['exif'][ekey] = self.exif[candidate]
+                    break
+
+        gps = ['Latitude', 'Longitude']
+        for g in gps:
+            gk = 'EXIF:GPS%s' % (g)
+            if gk not in self.exif:
+                continue
+
+            r = 'EXIF:GPS%sRef' % (g)
+            ref = None
+            if r in self.exif:
+                ref = self.exif[r]
+
+            tmpl['exif']['geo_%s' % (g.lower())] = self.gps2dec(
+                self.exif[gk],
+                ref
+            )
+
+        ##tmpl['imgurl'] = ''
+        #sizes = collections.OrderedDict(reversed(list(self.img.sizes.items())))
+        #for size, meta in sizes.items():
+            #if os.path.isfile(meta['path']):
+                #with Image.open(meta['path']) as im:
+                    #meta['width'], meta['height'] = im.size
+                #meta['size'] = os.path.getsize(meta['path'])
+                #tmpl['img'] = meta
+                #break
+
+        tmpl['img'] = self.img.meta
+        return tmpl
+
+
+    @staticmethod
+    def gps2dec(exifgps, ref=None):
+        # e.g. exiftool prints coordinates as: 51 deg 45' 4.80" N
+        pattern = re.compile(
+            r"(?P<deg>[0-9.]+)\s+deg\s+(?P<min>[0-9.]+)'\s+"
+            r"(?P<sec>[0-9.]+)\"(?:\s+(?P<dir>[NEWS]))?"
+        )
+        v = pattern.match(exifgps).groupdict()
+
+        dd = float(v['deg']) + (((float(v['min']) * 60) + (float(v['sec']))) / 3600)
+        if ref == 'West' or ref == 'South' or v['dir'] == "S" or v['dir'] == "W":
+            dd = dd * -1
+        return round(dd, 6)
+
+
+
+class PageHandler(SingularHandler):
+
+    def __init__(self, *args, **kwargs):
+        super(PageHandler, self).__init__(*args, **kwargs)
+        self._setup()
+
+    def _setup(self):
+        with open(self.fpath) as c:
+            self.content = c.read()
+            c.close()
+
+        self._content = '%s' % (self.content)
+        self._adaptify()
+        super(PageHandler, self)._postsetup()
+        self.template = 'page.html'
\ No newline at end of file
diff --git a/taxonomy.py b/taxonomy.py
new file mode 100644
index 0000000..f69f711
--- /dev/null
+++ b/taxonomy.py
@@ -0,0 +1,253 @@
+import math
+import logging
+import os
+import collections
+import json
+import glob
+from slugify import slugify
+from bs4 import BeautifulSoup
+from pprint import pprint
+
+class TaxonomyHandler(object):
+
+    def __init__(self, taxonomy='', name='', description='', exclude=False):
+        self.taxonomy = taxonomy
+        self.name = name
+        self.description = description
+        self.exclude = exclude
+        self.slug = slugify(self.name, only_ascii=True, lower=True)
+        self.posts = collections.OrderedDict()
+
+        self.taxp = os.path.join(glob.TARGET, self.taxonomy)
+        self.simplepath = os.path.join(self.taxp, 'index.html')
+        self.basep = os.path.join(self.taxp, self.slug)
+        self.pagedp = os.path.join(self.basep, 'page')
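+        # resulting layout (sketch): TARGET/<taxonomy>/index.html,
+        # TARGET/<taxonomy>/<slug>/index.html and .../<slug>/page/<n>/index.html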
+        self.indexpath = os.path.join(self.basep, 'index.html')
+
+        self.lptime = 0
+
+    def __getitem__(self, key):
+        return self.posts[key]
+
+    def __repr__(self):
+        return 'Taxonomy %s (name: %s, slug: %s) with %i posts' % (
+            self.taxonomy,
+            self.name,
+            self.slug,
+            len(self.posts)
+        )
+
+    def __next__(self):
+        try:
+            r = next(iter(self.posts.values()))
+        except StopIteration:
+            raise StopIteration()
+        return r
+
+    def __iter__(self):
+        for ix, post in self.posts.items():
+            yield post
+        return
+
+
+    def append(self, post):
+        k = int(post.date.timestamp)
+        if k in self.posts:
+            while k in self.posts:
+                k = int(k+1)
+
+        self.posts[k] = post
+        self.posts = collections.OrderedDict(sorted(self.posts.items(), reverse=True))
+
+
+    def index(self, ix):
+        """ Write search index """
+
+        writer = ix.writer()
+
+        t, lp = list(self.posts.items())[0]
+
+        writer.add_document(
+            title=self.name,
+            url="%s/%s/%s" % (glob.conf['site']['url'], self.taxonomy, self.slug),
+            content="%s %s" % (self.name, self.slug),
+            date=lp.date.datetime,
+            tags=",".join([self.name]),
+            weight=10
+        )
+        writer.commit()
+
+
+    def _test_freshness(self):
+        t, lp = list(self.posts.items())[0]
+        self.lptime = lp.ftime.st_mtime
+
+        if os.path.isfile(self.indexpath):
+            p = self.indexpath
+        elif os.path.isfile(self.simplepath):
+            p = self.simplepath
+        else:
+            return False
+
+        itime = os.stat(p)
+        if itime.st_mtime == self.lptime and not glob.FORCEWRITE:
+            logging.debug(
+                'Taxonomy tree is fresh for %s' % (self.name)
+            )
+            return True
+
+        return False
+
+
+    def _test_dirs(self):
+        if not os.path.isdir(self.taxp):
+            os.mkdir(self.taxp)
+        if not os.path.isdir(self.basep):
+            os.mkdir(self.basep)
+
+
+    def write_paginated(self):
+
+        if self._test_freshness():
+            return
+
+        self._test_dirs()
+
+        pages = math.ceil(len(self.posts) / glob.conf['perpage'])
+        page = 1
+
+        if len(self.taxonomy) and len(self.slug):
+            base_url = "/%s/%s/" % (self.taxonomy, self.slug)
+        else:
+            base_url = '/'
+
+        while page <= pages:
+            start = int((page-1) * int(glob.conf['perpage']))
+            end = int(start + int(glob.conf['perpage']))
+            posttmpls = [self.posts[k].tmpl() for k in list(sorted(
+                self.posts.keys(), reverse=True))[start:end]]
+
+            if page == 1:
+                tpath = self.indexpath
+                do_rss = True
+
+            else:
+                do_rss = False
+                if not os.path.isdir(self.pagedp):
+                    os.mkdir(self.pagedp)
+
+                tdir = os.path.join(self.pagedp, "%d" % page)
+
+                if not os.path.isdir(tdir):
+                    os.mkdir(tdir)
+                tpath = os.path.join(tdir, "index.html")
+
+            tvars = {
+                'taxonomy': {
+                    'url': base_url,
+                    'name': self.name,
+                    'taxonomy': self.taxonomy,
+                    'description': self.description,
+                    'paged': page,
+                    'total': pages,
+                    'perpage': glob.conf['perpage'],
+                },
+                'site': glob.conf['site'],
+                'posts': posttmpls,
+            }
+
+            tmpl = glob.jinja2env.get_template('archive.html')
+            logging.info("rendering %s" % (tpath))
+            with open(tpath, "w") as html:
+                r = tmpl.render(tvars)
+                soup = BeautifulSoup(r, "html5lib")
+                r = soup.prettify()
+                logging.info("writing %s" % (tpath))
+                html.write(r)
+                html.close()
+            os.utime(tpath, (self.lptime, self.lptime))
+
+            if do_rss:
+                feeddir = os.path.join(self.basep, 'feed')
+                if not os.path.isdir(feeddir):
+                    os.mkdir(feeddir)
+                feedpath = os.path.join(feeddir, "index.xml")
+                tmpl = glob.jinja2env.get_template('rss.html')
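+                # the feed is only written for page 1 and reuses the same
+                # tvars as the HTML archive template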
+                logging.info("rendering %s" % (feedpath))
+                with open(feedpath, "w") as html:
+                    r = tmpl.render(tvars)
+                    logging.info("writing %s" % (feedpath))
+                    html.write(r)
+                    html.close()
+                os.utime(feedpath, (self.lptime, self.lptime))
+
+            page = page+1
+
+    def write_simple(self, template='archive.html'):
+
+        if self._test_freshness():
+            return
+
+        self._test_dirs()
+
+        base_url = "/%s/" % (self.slug)
+
+        posttmpls = [self.posts[k].tmpl() for k in list(sorted(
+            self.posts.keys(), reverse=True))]
+
+        tvars = {
+            'taxonomy': {
+                'url': base_url,
+                'name': self.name,
+                'taxonomy': self.taxonomy,
+                'description': self.description,
+                'paged': 0,
+                'total': 0,
+                'perpage': glob.conf['perpage'],
+            },
+            'site': glob.conf['site'],
+            'posts': posttmpls,
+        }
+
+        with open(os.path.join(self.simplepath), "w") as html:
+            html.write(json.dumps(tvars, indent=4, sort_keys=True, default=str))
+            html.close()
+
+        #tmpl = glob.jinja2env.get_template('gallery.html')
+        #logging.info("rendering %s" % (indexpath))
+        #with open(indexpath, "w") as html:
+            #r = tmpl.render(tvars)
+            #soup = BeautifulSoup(r, "html5lib")
+            #r = soup.prettify()
+            #logging.info("writing %s" % (indexpath))
+            #html.write(r)
+            #html.close()
+        #os.utime(indexpath, (lptime, lptime))
+
+
+    def writesitemap(self):
+        sitemap = "%s/sitemap.txt" % (glob.TARGET)
+        urls = []
+        for p in self.posts.items():
+            t, data = p
+            urls.append("%s/%s" % (glob.conf['site']['url'], data.slug))
+
+        with open(sitemap, "w") as f:
+            logging.info("writing %s" % (sitemap))
+            f.write("\n".join(urls))
+            f.close()
\ No newline at end of file
diff --git a/update.sh b/update.sh
new file mode 100755
index 0000000..aba1b63
--- /dev/null
+++ b/update.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+if [ -f "/tmp/petermolnar.net.generator.lock" ]; then
+    exit 0;
+fi;
+
+lastfile="$(find /home/petermolnar.net/source/ -type f -name "*.md" -printf '%T+ %p\n' | sort | tail -n1 | awk '{print $2}')";
+lastfilemod=$(stat -c %Y "$lastfile");
+lastrunfile="/tmp/generator_last_run";
+lastrun=0;
+
+if [ -f "$lastrunfile" ]; then
+    lastrun=$(stat -c %Y "$lastrunfile");
+fi;
+
+if [ "$lastrun" -lt "$lastfilemod" ]; then
+    cd /home/petermolnar.net/src; ../.venv/bin/python3.5 generator.py;
+fi;
+
+exit 0;
diff --git a/webmentiondb.py b/webmentiondb.py
new file mode 100644
index 0000000..42f27ce
--- /dev/null
+++ b/webmentiondb.py
@@ -0,0 +1,103 @@
+import os
+import hashlib
+import logging
+import glob
+from webmentiontools.send import WebmentionSend
+import requests
+import json
+
+class WebmentionDB(object):
+    dbpath = glob.WEBMENTIONDB
+
+    def __init__(self):
+        self.sent = {}
+        self._loaddb()
+
+    def _loaddb(self):
+        if os.path.isfile(self.dbpath):
+            logging.info("loading pinged database")
+            with open(self.dbpath, 'r') as db:
+                self.sent = json.loads(db.read())
+
+    def _dumpdb(self):
+        with open(self.dbpath, "w") as db:
+            logging.info("writing pinged database")
+            db.write(json.dumps(self.sent, indent=4, sort_keys=True))
+            db.close()
+
+    def _refreshdb(self):
+        self._dumpdb()
+        self._loaddb()
+
+    def __getitem__(self, key):
+        r = {}
+        for i in self.sent.items():
+            h, data = i
+            if data['source'] == key:
+                r[data['target']] = {
+                    'time': data['time'],
+                    'response': data['response']
+                }
+
+        return r
+
+
+    def __len__(self):
+        return len(self.sent)
+
+
+    def posses(self, key):
+        r = []
+        for i in self.sent.items():
+            h, data = i
+
+            if data['source'] != key:
+                continue
+
+            if not len(data['response']):
+                continue
+
+            if 'url' not in data['response']:
+                continue
+
+            r.append(data['response']['url'])
+
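+        # e.g. r might end up as ['https://twitter.com/.../status/...']
+        # (hypothetical URL), one entry per successful syndication response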
+        return r
+
+
+    def ping(self, source, target, time=0, posse=False):
+        resp = {}
+        source = source.strip()
+        target = target.strip()
+
+        h = source + target + "%i" % (int(time))
+        h = h.encode('utf-8')
+        h = hashlib.sha1(h).hexdigest()
+        if h in self.sent.keys():
+            logging.debug("already pinged: %s" % (target))
+            return True
+
+        logging.debug("pinging: %s" % (target))
+
+        wm = WebmentionSend(source, target)
+        wm.send()
+        if hasattr(wm, 'response'):
+            resp = wm.response
+
+        # fire and forget archive.org call
+        try:
+            requests.get(
+                '%s%s' % ('https://web.archive.org/save/', target),
+                allow_redirects=False,
+                timeout=30,
+            )
+        except:
+            pass
+
+        self.sent[h] = {
+            'source': source,
+            'target': target,
+            'time': time,
+            'response': resp
+        }
+
+        self._refreshdb()
\ No newline at end of file