#!/usr/bin/env python3

import os
import re
import logging
import configparser
import json
import glob
import argparse
import shutil
from urllib.parse import urlparse
#from urllib.parse import urljoin
import asyncio
from math import ceil
import csv
import sqlite3
import frontmatter
import arrow
import langdetect
import wand.image
#import requests
#from bs4 import BeautifulSoup
from emoji import UNICODE_EMOJI
import shared
import db
from pprint import pprint


class MagicPHP(object):
    """ Collects 'gone' (HTTP 410) entries and redirects and renders them
    into the magic.php file of the build directory """
    name = 'magic.php'

    def __init__(self):
        # init 'gone 410' array: first column of every row
        self.gones = [
            row[0]
            for row in self._read_rows(shared.config.get('var', 'gone'), 1)
        ]
        # init manual redirects array: (from, to) pairs
        self.redirects = [
            (row[0], row[1])
            for row in self._read_rows(
                shared.config.get('var', 'redirects'), 2
            )
        ]

    @staticmethod
    def _read_rows(fpath, mincols):
        """ Read a space delimited file and return the rows that have at
        least `mincols` columns; a missing file yields an empty list """
        if not os.path.isfile(fpath):
            return []
        with open(fpath) as csvfile:
            # blank or short lines are skipped instead of raising IndexError
            return [
                row for row in csv.reader(csvfile, delimiter=' ')
                if len(row) >= mincols
            ]

    @property
    def phpfile(self):
        """ Full path of the rendered file inside the build directory """
        return os.path.join(
            shared.config.get('common', 'build'),
            self.name
        )

    async def render(self):
        """ Render the MagicPHP.html template and write it to phpfile """
        logging.info('saving %s', self.name)
        o = self.phpfile
        tmplfile = "%s.html" % (__class__.__name__)
        r = shared.j2.get_template(tmplfile).render({
            'site': shared.site,
            'redirects': self.redirects,
            'gones': self.gones
        })
        with open(o, 'wt') as out:
            logging.debug('writing file %s', o)
            out.write(r)


class NoDupeContainer(object):
    """ Base class to hold keys => data dicts with errors on dupes """

    def __init__(self):
        self.data = {}
        self.default = None
        # lazily created iterator backing __next__
        self._iterator = None

    def append(self, key, value):
        """ Store value under key; an already existing key is kept as it is
        and the collision is logged as an error """
        # all clear
        if key not in self.data:
            self.data[key] = value
            return

        # problem
        logging.error(
            "duplicate key error when populating %s: %s",
            self.__class__.__name__,
            key
        )
        logging.error(
            "current: %s",
            self.data.get(key)
        )
        logging.error(
            "problem: %s",
            value
        )
        return

    # TODO: return ordered version of data

    def __getitem__(self, key):
        return self.data.get(key, self.default)

    #def __delitem__(self, key):
        #return del(self.data[key])

    def __setitem__(self, key, value):
        return self.append(key, value)

    def __contains__(self, key):
        return key in self.data

    def __len__(self):
        return len(self.data)

    def __next__(self):
        """ Return the next (key, value) pair; dicts have no .next() method,
        so an explicit iterator over the items is kept on the instance """
        if self._iterator is None:
            self._iterator = iter(self.data.items())
        try:
            return next(self._iterator)
        except StopIteration:
            # reset, so a later next() call starts a fresh pass
            self._iterator = None
            raise

    def __iter__(self):
        """ Iterate over (key, value) pairs """
        for k, v in self.data.items():
            yield (k, v)
        return

    #def __repr__(self):
        #return json.dumps(self.data)

    #def __str__(self):
        #return "iteration container with %s items" % (len(self.data.keys()))


class FContainer(NoDupeContainer):
    """ This is a container that holds a list of files, based on
    NoDupeContainer, so it errors on duplicate file names and is populated
    with a recursive glob """

    def __init__(self, dirs=None, extensions=None):
        super().__init__()
        # None instead of mutable default arguments; same effective defaults
        dirs = [''] if dirs is None else dirs
        extensions = ['*'] if extensions is None else extensions
        files = []
        for ext in extensions:
            for p in dirs:
                files.extend(glob.iglob(
                    os.path.join(p, '*.%s' % (ext)),
                    recursive=True
                ))
        # eliminate duplicates
        files = list(set(files))
        for fpath in files:
            fname = os.path.basename(fpath)
            self.append(fname, fpath)


class Content(FContainer):
    """ This is a container that holds markdown and jpg files that are
    parsed into Singular objects when the container is populated; based on
    FContainer which is a NoDupeContainer """

    def __init__(self):
        dirs = [os.path.join(shared.config.get('dirs', 'content'), "**")]
        extensions = ['md', 'jpg']
        super().__init__(dirs, extensions)
        # replace every collected path with the parsed post
        self.data = {
            fname: Singular(fpath) for fname, fpath in self.data.items()
        }


class Category(NoDupeContainer):
    """ A Category which holds pubtime (int) => Singular data """
    indexfile = 'index.html'
    feedfile = 'index.atom'
    feeddir = 'feed'
    pagedir = 'page'
    taxonomy = 'category'

    def __init__(self, name=''):
        self.name = name
        super().__init__()

    def append(self, post):
        """ Add a post, keyed by its publish time """
        return super().append(post.pubtime, post)

    @property
    def mtime(self):
        """ Publish time of the newest post; 0 for an empty category """
        return int(max(self.data, default=0))

    @property
    def is_uptodate(self):
        """ True if the first index page exists and carries the mtime of
        the newest post """
        index = os.path.join(self.path_paged(), self.indexfile)
        if not os.path.isfile(index):
            return False
        return os.path.getmtime(index) == self.mtime

    @property
    def title(self):
        # TODO proper title
        return self.name

    def url_paged(self, page=1, feed=False):
        """ URL path of the given page, or of the feed when feed is set and
        page is 1 """
        x = '/'
        if self.name:
            x = "%s%s/%s" % (
                x,
                self.taxonomy,
                self.name,
            )
        # the nameless (front) category would otherwise produce '//page/1/',
        # which browsers treat as a protocol-relative URL
        x = x.rstrip('/')

        if page == 1 and feed:
            x = "%s/%s/" % (x, self.feeddir)
        else:
            x = "%s/%s/%s/" % (x, self.pagedir, "%s" % page)
        return x

    def path_paged(self, page=1, feed=False):
        """ Directory in the build tree for the given page or the feed; the
        directory is created when it is missing """
        x = shared.config.get('common', 'build')

        if self.name:
            x = os.path.join(
                x,
                self.taxonomy,
                self.name,
            )

        if page == 1:
            if feed:
                x = os.path.join(x, self.feeddir)
        else:
            x = os.path.join(x, self.pagedir, "%s" % page)

        if not os.path.isdir(x):
            os.makedirs(x)
        return x

    def write_html(self, path, content):
        """ Write content to path and stamp the file with the category
        mtime """
        with open(path, 'wt') as out:
            logging.debug('writing file %s', path)
            out.write(content)
        # set the times after the file is closed, so the final flush can
        # not overwrite them
        os.utime(path, (self.mtime, self.mtime))

    async def render(self):
        """ Render every index page and, alongside page 1, the feed """
        force = shared.config.getboolean('params', 'force', fallback=False)
        if self.is_uptodate and not force:
            return

        pagination = shared.config.getint('display', 'pagination')
        pages = ceil(len(self.data) / pagination)
        # newest first; sorted once instead of once per page
        keys = sorted(self.data.keys(), reverse=True)

        for page in range(1, pages + 1):
            # list relevant post templates
            start = int((page - 1) * pagination)
            end = int(start + pagination)
            posttmpls = [self.data[k].tmplvars for k in keys[start:end]]

            # define data for template
            tmplvars = {
                'taxonomy': {
                    'title': self.title,
                    'name': self.name,
                    'page': page,
                    'total': pages,
                    'perpage': pagination,
                    'lastmod': arrow.get(self.mtime).format(
                        shared.ARROWFORMAT['iso']
                    ),
                    'feed': self.url_paged(page=1, feed=True),
                    'url': self.url_paged(page),
                },
                'site': shared.site,
                'posts': posttmpls,
            }

            # render HTML
            dirname = self.path_paged(page)
            o = os.path.join(dirname, self.indexfile)
            logging.info(
                "Rendering page %d/%d of category %s to %s",
                page,
                pages,
                self.name,
                o
            )
            tmplfile = "%s.html" % (__class__.__name__)
            r = shared.j2.get_template(tmplfile).render(tmplvars)
            self.write_html(o, r)

            # render feed
            if 1 == page:
                dirname = self.path_paged(page, feed=True)
                o = os.path.join(dirname, self.feedfile)
                logging.info(
                    "Rendering feed of category %s to %s",
                    self.name,
                    o
                )
                tmplfile = "%s_%s.html" % (
                    __class__.__name__,
                    self.feeddir
                )
                r = shared.j2.get_template(tmplfile).render(tmplvars)
                self.write_html(o, r)


class Singular(object):
    """ A single post, created either from a markdown file with front
    matter or from a jpg, in which case the image metadata is the source of
    the meta and the content """
    indexfile = 'index.html'

    def __init__(self, fpath):
        logging.debug("initiating singular object from %s", fpath)
        self.fpath = fpath
        self.mtime = os.path.getmtime(self.fpath)
        self.fname, self.fext = os.path.splitext(os.path.basename(self.fpath))
        # the name of the parent directory is the category
        self.category = os.path.basename(os.path.dirname(self.fpath))
        self._images = NoDupeContainer()

        if '.md' == self.fext:
            with open(self.fpath, mode='rt') as f:
                self.fm = frontmatter.parse(f.read())
                self.meta, self.content = self.fm
            self.photo = None
        elif '.jpg' == self.fext:
            self.photo = WebImage(self.fpath)
            self.meta = self.photo.fm_meta
            self.content = self.photo.fm_content
            self.photo.inline = False
            self.photo.cssclass = 'u-photo'

    @property
    def redirects(self):
        """ Unique list of the 'redirect' meta entries plus the shortslug """
        r = self.meta.get('redirect') or []
        if isinstance(r, str):
            r = [r]
        # build a new collection: appending to the list stored in the meta
        # would alter the front matter on every access
        return list({*r, self.shortslug})

    @property
    def is_uptodate(self):
        """ True if the rendered HTML exists and carries the mtime of the
        source file """
        if not os.path.isfile(self.htmlfile):
            return False
        return os.path.getmtime(self.htmlfile) == self.mtime

    @property
    def htmlfile(self):
        """ Path of the rendered HTML file in the build directory """
        return os.path.join(
            shared.config.get('common', 'build'),
            self.fname,
            self.indexfile
        )

    @property
    def images(self):
        """ Container of the post photo (if any) and of the inline images
        that can be located on disk """
        # guarded, so repeated access does not log duplicate key errors
        if self.photo and self.fname not in self._images:
            self._images.append(self.fname, self.photo)
        # add inline images
        for shortcode, alt, fname, title, css in self.inline_images:
            # this does the appending automatically; basename is used to
            # look up the same file the body property does
            self._find_image(os.path.basename(fname))
        return self._images

    @property
    def comments(self):
        """ Container of mtime => Comment, collected from the comment
        directories named after the slug and after every redirect """
        comments = NoDupeContainer()
        cfiles = []
        lookin = [*self.redirects, self.fname]
        for d in lookin:
            cfiles.extend(glob.glob(
                os.path.join(
                    shared.config.get('dirs', 'comment'),
                    d,
                    '*.md'
                )
            ))

        for cpath in cfiles:
            c = Comment(cpath)
            comments.append(c.mtime, c)
        return comments

    @property
    def replies(self):
        """ Sorted list of (mtime, template variables) of the comments
        whose type is 'webmention' """
        r = {}
        for mtime, c in self.comments:
            if 'webmention' == c.type:
                r.update({mtime: c.tmplvars})
        return sorted(r.items())

    @property
    def reactions(self):
        """ Dict of comment type => sorted list of (mtime, template
        variables) for every comment that is not of type 'webmention' """
        r = {}
        for mtime, c in self.comments:
            if 'webmention' == c.type:
                continue
            if c.type not in r:
                r[c.type] = {}
            r[c.type].update({mtime: c.tmplvars})

        for icon, comments in r.items():
            r[icon] = sorted(comments.items())
        return r

    @property
    def exif(self):
        """ EXIF dict of the photo; empty dict for posts without one """
        if not self.photo:
            return {}
        return self.photo.exif

    @property
    def published(self):
        """ 'published' meta entry as arrow object, falling back to the
        file mtime """
        return arrow.get(self.meta.get('published', self.mtime))

    @property
    def updated(self):
        """ 'updated' meta entry as arrow object, or False when missing """
        u = self.meta.get('updated', False)
        if u:
            u = arrow.get(u)
        return u

    @property
    def pubtime(self):
        return int(self.published.timestamp)

    @property
    def is_reply(self):
        """ The 'in-reply-to' meta entry, or False """
        return self.meta.get('in-reply-to', False)

    @property
    def is_future(self):
        now = arrow.utcnow().timestamp
        return self.pubtime > now

    @property
    def licence(self):
        """ Licence text and URL based on the category, falling back to
        the default licence of the config """
        code = shared.config.get(
            'licence',
            self.category,
            fallback=shared.config.get('licence', 'default',)
        )
        return {
            'text': 'CC %s 4.0' % code.upper(),
            'url': 'https://creativecommons.org/licenses/%s/4.0/' % code,
        }

    @property
    def corpus(self):
        """ Text to be indexed for search: title, slug, summary, content,
        and for photos the tags as well """
        parts = [
            "%s" % self.meta.get('title', ''),
            "%s" % self.fname,
            "%s" % self.meta.get('summary', ''),
            "%s" % self.content,
        ]
        if self.photo:
            # as separate lines; plain concatenation would glue the first
            # tag to the last word of the content
            parts.extend("%s" % t for t in self.meta.get('tags', []))
        return "\n".join(parts)

    @property
    def lang(self):
        # default is English, this will only be changed if the try
        # succeeds and actually detects a language
        lang = 'en'
        try:
            lang = langdetect.detect("\n".join([
                self.fname,
                self.meta.get('title', ''),
                self.content
            ]))
        except Exception as e:
            # detection is best effort: keep the default
            logging.debug(
                'language detection failed for %s: %s', self.fname, e
            )
        return lang

    def _find_image(self, fname):
        """ Locate fname one level below the files directory; returns the
        WebImage, creating and caching it on first use, or None when no
        file is found """
        pattern = os.path.join(
            shared.config.get('dirs', 'files'),
            '*',
            fname
        )
        logging.debug('trying to locate image %s in %s', fname, pattern)
        maybe = glob.glob(pattern)

        if not maybe:
            return None

        if fname not in self._images:
            im = WebImage(maybe.pop())
            self._images.append(fname, im)
        return self._images[fname]

    @property
    def inline_images(self):
        """ All matches of the markdown image regex in the content """
        return shared.REGEX['mdimg'].findall(self.content)

    @property
    def url(self):
        return "%s/%s" % (shared.config.get('site', 'url'), self.fname)

    @property
    def body(self):
        """ Content with the shortcodes of locatable inline images replaced
        by the rendered image """
        body = "%s" % (self.content)
        # get inline images, downsize them and convert them to figures
        for shortcode, alt, fname, title, css in self.inline_images:
            fname = os.path.basename(fname)
            im = self._find_image(fname)
            if not im:
                continue

            im.alt = alt
            im.title = title
            im.cssclass = css
            body = body.replace(shortcode, str(im))

        # TODO if multiple meta images, inline all except the first
        # which will be added at the HTML stage or as enclosure to the feed
        return body

    @property
    def html(self):
        """ Body, prefixed with the photo if there is one, converted with
        Pandoc """
        html = "%s" % (self.body)
        # add photo
        if self.photo:
            html = "%s\n%s" % (str(self.photo), html)
        return shared.Pandoc().convert(html)

    @property
    def title(self):
        """ Title from the meta; 'RE: <target>' for replies without one;
        the formatted publish date otherwise """
        maybe = self.meta.get('title', False)
        if maybe:
            return maybe
        if self.is_reply:
            return "RE: %s" % self.is_reply
        return self.published.format(shared.ARROWFORMAT['display'])

    @property
    def summary(self):
        """ Summary converted with Pandoc; empty summaries are returned
        as they are """
        s = self.meta.get('summary', '')
        if not s:
            return s
        return shared.Pandoc().convert(s)

    @property
    def shortslug(self):
        return shared.baseN(self.pubtime)

    @property
    def tmplvars(self):
        # very simple caching because we might use this 4 times:
        # post HTML, category, front posts and atom feed
        if not hasattr(self, '_tmplvars'):
            self._tmplvars = {
                'title': self.title,
                'pubtime': self.published.format(shared.ARROWFORMAT['iso']),
                'pubdate': self.published.format(
                    shared.ARROWFORMAT['display']
                ),
                'category': self.category,
                'html': self.html,
                'lang': self.lang,
                'slug': self.fname,
                'shortslug': self.shortslug,
                'licence': self.licence,
                #'sourceurl': self.sourceurl,
                'is_reply': self.is_reply,
                'age': int(self.published.format('YYYY'))
                       - int(arrow.utcnow().format('YYYY')),
                'summary': self.summary,
                'replies': self.replies,
                'reactions': self.reactions,
            }
        return self._tmplvars

    async def render(self):
        """ Render the post into htmlfile and stamp it with the mtime of
        the source; skipped when up to date, unless rendering is forced """
        logging.info('rendering %s', self.fname)
        o = self.htmlfile

        # without the force check here a forced render would still be
        # skipped for up to date posts
        force = shared.config.getboolean('params', 'force', fallback=False)
        if self.is_uptodate and not force:
            logging.debug('%s is up to date', o)
            return

        tmplfile = "%s.html" % (__class__.__name__)
        r = shared.j2.get_template(tmplfile).render({
            'post': self.tmplvars,
            'site': shared.site,
        })

        d = os.path.dirname(o)
        if not os.path.isdir(d):
            logging.debug('creating directory %s', d)
            os.makedirs(d)
        with open(o, 'wt') as out:
            logging.debug('writing file %s', o)
            out.write(r)
        # set the times after the file is closed
        os.utime(o, (self.mtime, self.mtime))

    def __repr__(self):
        return "%s/%s" % (self.category, self.fname)


class WebImage(object):
    """ An image file with cached EXIF metadata that can be downsized,
    watermarked, copied into the build directory and rendered through the
    WebImage.html template """

    def __init__(self, fpath):
        logging.info("parsing image: %s", fpath)
        self.fpath = fpath
        self.mtime = os.path.getmtime(self.fpath)
        bname = os.path.basename(fpath)
        self.fname, self.fext = os.path.splitext(bname)
        self.title = ''
        self.alt = bname
        self.target = ''
        self.cssclass = ''

    @property
    def fm_content(self):
        """ Post content replacement: the 'Description' metadata field """
        return self.meta.get('Description', '')

    @property
    def fm_meta(self):
        """ Front matter replacement built from the image metadata """
        return {
            'published': self.meta.get(
                'ReleaseDate',
                self.meta.get('ModifyDate')
            ),
            'title': self.meta.get('Headline', self.fname),
            'tags': list(set(self.meta.get('Subject', []))),
        }

    @property
    def href(self):
        """ Link target: the explicit target if set, else the URL of the
        largest size; False when the image can not be downsized """
        if len(self.target):
            return self.target

        if not self.is_downsizeable:
            return False

        return self.sizes[-1][1]['url']

    @property
    def src(self):
        # if the image is too small to downsize, it will be copied over
        # so the link needs to point at the copy
        src = "/%s/%s" % (
            shared.config.get('common', 'files'),
            "%s%s" % (self.fname, self.fext)
        )

        if self.is_downsizeable:
            try:
                default = shared.config.getint('photo', 'default')
                src = [
                    e for e in self.sizes if e[0] == default
                ][0][1]['url']
            except (IndexError, ValueError, configparser.Error) as e:
                # best effort: no usable default size, keep the plain URL
                logging.debug(
                    'no default size for %s, keeping %s: %s',
                    self.fpath, src, e
                )
        return src

    @property
    def meta(self):
        """ Metadata dict as read by shared.ExifTool; cached on the
        instance and as JSON in the cache directory """
        if not hasattr(self, '_exif'):
            # reading EXIF is expensive enough even with a static generator
            # to consider caching it, so I'll do that here
            cachedir = shared.config.get('var', 'cache')
            cpath = os.path.join(cachedir, "%s.exif.json" % self.fname)

            # the cache is only valid when it is not older than the image
            if os.path.exists(cpath):
                cmtime = os.path.getmtime(cpath)
                if cmtime >= self.mtime:
                    with open(cpath, 'rt') as f:
                        self._exif = json.loads(f.read())
                    return self._exif

            self._exif = shared.ExifTool(self.fpath).read()
            if not os.path.isdir(cachedir):
                os.makedirs(cachedir)
            with open(cpath, 'wt') as f:
                f.write(json.dumps(self._exif))
        return self._exif

    @property
    def is_photo(self):
        """ True if the Copyright or the Artist metadata field matches the
        photo regex of the config """
        # missing regex from config
        if 'photo' not in shared.REGEX:
            logging.debug('%s photo regex missing from config', self.fpath)
            return False

        cpr = self.meta.get('Copyright', '')
        art = self.meta.get('Artist', '')

        # both Artist and Copyright missing from EXIF
        if not cpr and not art:
            logging.debug(
                '%s Artist or Copyright missing from EXIF', self.fpath
            )
            return False

        # we have regex, Artist and Copyright, try matching them
        pattern = re.compile(shared.config.get('photo', 'regex'))
        if pattern.search(cpr) or pattern.search(art):
            return True

        logging.debug('%s patterns did not match', self.fpath)
        return False

    @property
    def exif(self):
        """ Subset of the metadata for display; empty dict for images that
        are not photos. For every key the first candidate field with a
        value wins """
        exif = {}
        if not self.is_photo:
            return exif

        mapping = {
            'camera': ['Model'],
            'aperture': ['FNumber', 'Aperture'],
            'shutter_speed': ['ExposureTime'],
            'focallength': ['FocalLengthIn35mmFormat', 'FocalLength'],
            'iso': ['ISO'],
            'lens': ['LensID', 'LensSpec', 'Lens'],
            #'date': ['CreateDate','DateTimeOriginal'],
            'geo_latitude': ['GPSLatitude'],
            'geo_longitude': ['GPSLongitude'],
        }

        for ekey, candidates in mapping.items():
            for candidate in candidates:
                maybe = self.meta.get(candidate, None)
                if not maybe:
                    continue
                elif 'geo_' in ekey:
                    exif[ekey] = round(float(maybe), 5)
                else:
                    exif[ekey] = maybe
                break
        return exif

    @property
    def sizes(self):
        """ List of (size, info) tuples, sorted by size, for every entry of
        the 'downsize' config section that is not larger than the longer
        side of the image. info holds 'fpath', 'exists', 'url', 'crop' """
        sizes = []
        _max = max(
            int(self.meta.get('ImageWidth')),
            int(self.meta.get('ImageHeight'))
        )

        for size in shared.config.options('downsize'):
            if _max < int(size):
                continue

            name = '%s_%s%s' % (
                self.fname,
                shared.config.get('downsize', size),
                self.fext
            )

            fpath = os.path.join(
                shared.config.get('common', 'build'),
                shared.config.get('common', 'files'),
                name
            )

            exists = os.path.isfile(fpath)
            # in case there is a downsized image compare against the main
            # file's mtime and invalidate the existing if it's older
            if exists:
                mtime = os.path.getmtime(fpath)
                if self.mtime > mtime:
                    exists = False

            sizes.append((
                int(size),
                {
                    'fpath': fpath,
                    # the invalidated flag, not a second plain isfile()
                    # check that would ignore the mtime comparison
                    'exists': exists,
                    'url': "%s/%s/%s" % (
                        shared.config.get('site', 'url'),
                        shared.config.get('common', 'files'),
                        name
                    ),
                    'crop': shared.config.getboolean(
                        'crop',
                        size,
                        fallback=False
                    )
                }
            ))
        return sorted(sizes, key=lambda s: s[0])

    @property
    def is_downsizeable(self):
        """ Check if the image is a jpeg or png and is large enough to
        downsize it """
        ftype = self.meta.get('FileType', None)
        if not ftype:
            return False
        if ftype.lower() not in ('jpeg', 'png'):
            return False

        _max = max(
            int(self.meta.get('ImageWidth')),
            int(self.meta.get('ImageHeight'))
        )
        _min = shared.config.getint('photo', 'default')
        return _max > _min

    def _maybe_watermark(self, img):
        """ Composite image by adding watermark file over it """
        if not self.is_photo:
            logging.debug("not watermarking: not a photo")
            return img

        wmarkfile = shared.config.get('photo', 'watermark')
        if not os.path.isfile(wmarkfile):
            logging.debug("not watermarking: watermark not found")
            return img

        logging.debug("%s is a photo, applying watermarking", self.fpath)
        with wand.image.Image(filename=wmarkfile) as wmark:
            if img.width > img.height:
                w = img.width * 0.2
                h = wmark.height * (w / wmark.width)
                x = img.width - w - (img.width * 0.01)
                y = img.height - h - (img.height * 0.01)
            else:
                # the watermark gets rotated below, so its width and
                # height are swapped when the position is calculated
                w = img.height * 0.16
                h = wmark.height * (w / wmark.width)
                x = img.width - h - (img.width * 0.01)
                y = img.height - w - (img.height * 0.01)

            w = round(w)
            h = round(h)
            x = round(x)
            y = round(y)

            wmark.resize(w, h)
            if img.width <= img.height:
                wmark.rotate(-90)
            img.composite(image=wmark, left=x, top=y)
        return img

    def _copy(self):
        """ Copy the original into the build files directory, unless an
        up to date copy is already there """
        fname = "%s%s" % (self.fname, self.fext)
        logging.info("copying %s to build dir", fname)
        build_files = os.path.join(
            shared.config.get('common', 'build'),
            shared.config.get('common', 'files'),
        )
        fpath = os.path.join(build_files, fname)
        if os.path.isfile(fpath):
            mtime = os.path.getmtime(fpath)
            if self.mtime <= mtime:
                return
        # the target directory may not exist on a fresh build
        os.makedirs(build_files, exist_ok=True)
        shutil.copy(self.fpath, fpath)

    def _intermediate_dimension(self, size, width, height, crop=False):
        """ Calculate intermediate resize dimension and return a tuple of
        width, height """
        size = int(size)
        if (width > height and not crop) \
        or (width < height and crop):
            w = size
            h = int(float(size / width) * height)
        else:
            h = size
            w = int(float(size / height) * width)
        return (w, h)

    def _intermediate(self, img, size, target, crop=False):
        """ Write a resized clone of img to target; returns False without
        writing when img is not larger than size """
        if img.width <= size and img.height <= size:
            return False

        with img.clone() as thumb:
            width, height = self._intermediate_dimension(
                size,
                img.width,
                img.height,
                crop
            )
            thumb.resize(width, height)

            if crop:
                thumb.liquid_rescale(size, size, 1, 1)

            if self.meta.get('FileType', 'jpeg').lower() == 'jpeg':
                thumb.compression_quality = 86
                thumb.unsharp_mask(
                    radius=0,
                    sigma=0.5,
                    amount=1,
                    threshold=0.03
                )
                thumb.format = 'pjpeg'

            # this is to make sure pjpeg happens
            with open(target, 'wb') as f:
                logging.info("writing %s", target)
                thumb.save(file=f)

    @property
    def needs_downsize(self):
        """ True if any of the sizes is missing or outdated """
        needed = False
        for (size, downsized) in self.sizes:
            if downsized.get('exists', False):
                logging.debug(
                    "size %d exists: %s", size, downsized.get('fpath')
                )
                continue
            logging.debug(
                "size %d missing: %s", size, downsized.get('fpath')
            )
            needed = True
        return needed

    async def downsize(self):
        """ Create all the downsized versions; images that can not be
        downsized are copied as they are """
        if not self.is_downsizeable:
            return self._copy()

        if not self.needs_downsize \
        and not shared.config.getboolean('params', 'regenerate'):
            return

        build_files = os.path.join(
            shared.config.get('common', 'build'),
            shared.config.get('common', 'files'),
        )
        if not os.path.isdir(build_files):
            os.makedirs(build_files)

        logging.info("downsizing %s%s", self.fname, self.fext)
        with wand.image.Image(filename=self.fpath) as img:
            img.auto_orient()
            img = self._maybe_watermark(img)
            for (size, downsized) in self.sizes:
                self._intermediate(
                    img,
                    size,
                    downsized['fpath'],
                    downsized['crop']
                )

    @property
    def tmplvars(self):
        """ Variables for the image template """
        return {
            'src': self.src,
            'target': self.href,
            'css': self.cssclass,
            'title': self.title,
            'alt': self.alt,
            'exif': self.exif,
            'is_photo': self.is_photo,
            'author': self.meta.get('Artist', ''),
        }

    def __repr__(self):
        return "Image: %s, photo: %r, EXIF: %s" % (
            self.fname, self.is_photo, self.exif
        )

    def __str__(self):
        """ The image rendered through the WebImage.html template """
        tmplfile = "%s.html" % (__class__.__name__)
        return shared.j2.get_template(tmplfile).render(
            {'photo': self.tmplvars}
        )


class Comment(object):
    """ A comment or reaction, parsed from a markdown file with front
    matter """

    def __init__(self, fpath):
        logging.debug("initiating comment object from %s", fpath)
        self.fpath = fpath
        self.mtime = os.path.getmtime(self.fpath)
        with open(self.fpath, mode='rt') as f:
            self.fm = frontmatter.parse(f.read())
            self.meta, self.content = self.fm

    @property
    def dt(self):
        """ The 'date' meta entry as arrow object """
        return arrow.get(self.meta.get('date'))

    @property
    def html(self):
        html = "%s" % (self.content)
        return shared.Pandoc().convert(html)

    @property
    def target(self):
        """ Last path segment of the 'target' URL """
        t = urlparse(self.meta.get('target'))
        return t.path.rstrip('/').strip('/').split('/')[-1]

    @property
    def source(self):
        return self.meta.get('source')

    @property
    def author(self):
        """ Name and URL of the author; the URL falls back to the source,
        the name to the hostname of the URL """
        # a missing author entry must not raise AttributeError
        author = self.meta.get('author') or {}
        url = author.get('url', self.source)
        name = author.get('name', urlparse(url).hostname)

        return {
            'name': name,
            'url': url
        }

    @property
    def type(self):
        # caching, because calling Pandoc is expensive
        if not hasattr(self, '_type'):
            self._type = 'webmention'
            t = self.meta.get('type', 'webmention')
            if 'webmention' != t:
                self._type = '★'

            # content which is nothing but a single known emoji becomes
            # the type itself
            if len(self.content):
                maybe = shared.Pandoc('plain').convert(self.content)
                if maybe in UNICODE_EMOJI:
                    self._type = maybe
        return self._type

    @property
    def tmplvars(self):
        """ Variables for the comment template; cached on the instance """
        if not hasattr(self, '_tmplvars'):
            self._tmplvars = {
                'author': self.author,
                'source': self.source,
                'pubtime': self.dt.format(shared.ARROWFORMAT['iso']),
                'pubdate': self.dt.format(shared.ARROWFORMAT['display']),
                'html': self.html,
                'type': self.type
            }
        return self._tmplvars

    def __repr__(self):
        return "Comment from %s for %s" % (
            self.source, self.target
        )

    def __str__(self):
        """ The comment rendered through the Comment.html template """
        tmplfile = "%s.html" % (__class__.__name__)
        return shared.j2.get_template(tmplfile).render(
            {'comment': self.tmplvars}
        )


#class SendWebmention(object):
    ## TODO
    #def __init__(self, source, target):
        ## check in gone.tsv?
        ## discover endpoint
        ## send webmention
        ## add to DB on return

    #def run(self):
        #return

#class ReceiveWebmention(object):
    ## TODO
    #def __init__(self, source, target):
        ## pull remote
        ## validate if page links to X anywhere
        ## find h-entry or use root as SOURCE
        ## find author in SOURCE
        ## find content in SOURCE
        ## save under comments/[target slug]/mtime-[from-slugified-url].md
        ##
        ## add to DB on return

    #def run(self):
        #return

#def parse_received_queue():
    # iterate over DB received

#def parse_send_queue():
    # iterate over DB needs sending

#def webmentions(target_slug):
    # find all webmentions in the relevant directory
    # return mtime => Webmention hash


def setup():
    """ parse input parameters and add them as params section to config """
    parser = argparse.ArgumentParser(description='Parameters for NASG')

    booleanparams = {
        'regenerate': 'force downsizing images',
        'force': 'force rendering HTML',
    }

    for k, v in booleanparams.items():
        parser.add_argument(
            '--%s' % (k),
            action='store_true',
            default=False,
            help=v
        )

    parser.add_argument(
        '--loglevel',
        default='warning',
        help='change loglevel'
    )

    if not shared.config.has_section('params'):
        shared.config.add_section('params')

    # config values must be strings: booleans are stored as 'True' or
    # 'False' and have to be read back with getboolean()
    params = vars(parser.parse_args())
    for k, v in params.items():
        shared.config.set('params', k, str(v))

    # remove the rest of the potential loggers
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])

    logging.basicConfig(
        level=shared.LLEVEL[shared.config.get('params', 'loglevel')],
        format='%(asctime)s - %(levelname)s - %(message)s'
    )


def build():
    """ Parse the content, schedule and run the rendering and downsizing
    tasks, then copy the static files into the build directory """
    setup()

    loop = asyncio.get_event_loop()
    tasks = []
    content = Content()
    sdb = db.SearchDB()
    magic = MagicPHP()

    collector_front = Category()
    collector_categories = NoDupeContainer()

    # getboolean, because the stored value is the string 'True' or 'False'
    # and any non-empty string is truthy
    force = shared.config.getboolean('params', 'force')

    for f, post in content:
        logging.info("PARSING %s", f)

        # extend redirects
        for r in post.redirects:
            magic.redirects.append((r, post.fname))

        # add post to search, if needed
        if not sdb.is_uptodate(post.fname, post.mtime):
            sdb.append(
                post.fname,
                post.corpus,
                post.mtime,
                post.url,
                post.category,
                post.title
            )

        # add render task, if needed
        if not post.is_uptodate or force:
            task = loop.create_task(post.render())
            tasks.append(task)

        # collect images to downsize
        for fname, im in post.images:
            task = loop.create_task(im.downsize())
            tasks.append(task)

        # skip categories starting with _
        if post.category.startswith('_'):
            continue
        # get the category otherwise
        elif post.category not in collector_categories:
            c = Category(post.category)
            collector_categories.append(post.category, c)
        else:
            c = collector_categories[post.category]

        # add post to category
        c.append(post)

        # add post to front
        collector_front.append(post)

    # write search db
    sdb.finish()

    # render front
    task = loop.create_task(collector_front.render())
    tasks.append(task)

    # render categories
    for name, c in collector_categories:
        task = loop.create_task(c.render())
        tasks.append(task)

    # add magic.php rendering
    task = loop.create_task(magic.render())
    tasks.append(task)

    # TODO: send webmentions to any url
    # TODO: comments
    # TODO: ping websub?

    # do all the things!
    w = asyncio.wait(tasks)
    loop.run_until_complete(w)
    loop.close()

    # copy static
    logging.info('copying static files')
    src = shared.config.get('dirs', 'static')
    for item in os.listdir(src):
        s = os.path.join(src, item)
        d = os.path.join(shared.config.get('common', 'build'), item)
        if not os.path.exists(d):
            logging.debug("copying static file %s to %s", s, d)
            shutil.copy2(s, d)


if __name__ == '__main__':
    build()