#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: set fileencoding=utf-8 :

__author__ = "Peter Molnar"
__copyright__ = "Copyright 2017-2018, Peter Molnar"
__license__ = "GPLv3"
__version__ = "2.2.0"
__maintainer__ = "Peter Molnar"
__email__ = "mail@petermolnar.net"
__status__ = "Production"

"""
    silo archiver module of NASG
    Copyright (C) 2017-2018 Peter Molnar

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software Foundation,
    Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
"""

import os
import re
import logging
import json
import glob
import argparse
import shutil
from urllib.parse import urlparse
import asyncio
from math import ceil
import csv
import html

import frontmatter
import requests
import arrow
import langdetect
import wand.image
from emoji import UNICODE_EMOJI
from feedgen.feed import FeedGenerator

import shared


class MagicPHP(object):
    ''' router PHP generator '''
    name = 'index.php'

    def __init__(self):
        """ Load the 'gone' list and the manual redirects from their files """
        # init 'gone 410' array
        self.gones = []
        f_gone = shared.config.get('var', 'gone')
        if os.path.isfile(f_gone):
            with open(f_gone, newline='') as csvfile:
                for row in csv.reader(csvfile, delimiter=' '):
                    # blank lines yield empty rows; skip them
                    if not row:
                        continue
                    self.gones.append(row[0])

        # init manual redirects array
        self.redirects = []
        f_redirect = shared.config.get('var', 'redirects')
        if os.path.isfile(f_redirect):
            with open(f_redirect, newline='') as csvfile:
                for row in csv.reader(csvfile, delimiter=' '):
                    # a redirect needs both a source and a target column
                    if len(row) < 2:
                        continue
                    self.redirects.append((row[0], row[1]))

    @property
    def phpfile(self):
        """ Path of the generated PHP file inside the build directory """
        return os.path.join(
            shared.config.get('common', 'build'),
            self.name
        )

    async def render(self):
        """ Render the router template and write it to `phpfile` """
        logging.info('saving %s', self.name)
        o = self.phpfile
        tmplfile = "%s.html" % (self.__class__.__name__)
        r = shared.j2.get_template(tmplfile).render({
            'site': shared.site,
            'redirects': self.redirects,
            'gones': self.gones
        })
        with open(o, 'wt') as out:
            logging.debug('writing file %s', o)
            out.write(r)


class NoDupeContainer(object):
    ''' Base class to hold keys => data dicts with errors on dupes '''

    def __init__(self):
        self.data = {}
        self.default = None

    def append(self, key, value):
        """ Store value under key; log errors and keep the old value if
        the key is already present """
        # all clear
        if key not in self.data:
            self.data.update({key: value})
            return

        # problem
        logging.error(
            "duplicate key error when populating %s: %s",
            self.__class__.__name__,
            key
        )
        logging.error(
            "current: %s",
            self.data.get(key)
        )
        logging.error(
            "problem: %s",
            value
        )
        return

    # TODO: return ordered version of data

    def __getitem__(self, key):
        return self.data.get(key, self.default)

    def __setitem__(self, key, value):
        return self.append(key, value)

    def __contains__(self, key):
        return key in self.data

    def __len__(self):
        return len(self.data)

    def __next__(self):
        # The previous implementation called dict.next(), which does not
        # exist, so it always ended up raising StopIteration; iteration is
        # provided by the generator in __iter__ instead.
        raise StopIteration()

    def __iter__(self):
        """ Yield (key, value) tuples """
        for k, v in self.data.items():
            yield (k, v)
        return


class FContainer(NoDupeContainer):
    """ This is a container that holds a list of files based on Container so
    it errors on duplicate slugs and is populated with recursive glob """

    def __init__(self, dirs, extensions=('*',)):
        """
        dirs: iterable of directory patterns to glob in
        extensions: iterable of file extensions (without the dot)
        """
        super().__init__()
        files = []
        for ext in extensions:
            for p in dirs:
                files.extend(glob.iglob(
                    os.path.join(p, '*.%s' % (ext)),
                    recursive=True
                ))
        # eliminate duplicates
        files = list(set(files))
        for fpath in files:
            fname = os.path.basename(fpath)
            self.append(fname, fpath)


class Content(FContainer):
    """ This is a container that holds markdown files that are parsed when the
    container is populated on the fly; based on FContainer which is a Container
    """

    def __init__(self):
        dirs = [os.path.join(shared.config.get('dirs', 'content'), "**")]
        extensions = ['md', 'jpg']
        super().__init__(dirs, extensions)
        # replace every collected path with its parsed Singular; iterate over
        # a snapshot so the dict is not written to while its view is walked
        for fname, fpath in list(self.data.items()):
            self.data[fname] = Singular(fpath)


class Category(NoDupeContainer):
    """ A Category which holds pubtime (int) => Singular data """
    indexfile = 'index.html'
    feedfile = 'index.xml'
    feeddir = 'feed'
    pagedir = 'page'
    taxonomy = 'category'

    def __init__(self, name='', is_front=False):
        self.name = name
        self.topics = NoDupeContainer()
        self.is_front = is_front
        super().__init__()

    def append(self, post):
        """ Add a post keyed by its pubtime; posts with exactly one tag are
        also collected under that tag in self.topics """
        if len(post.tags) == 1:
            topic = post.tags[0]
            if topic not in self.topics:
                t = NoDupeContainer()
                self.topics.append(topic, t)
            t = self.topics[topic]
            t.append(post.pubtime, post)
        return super().append(post.pubtime, post)

    @property
    def mtime(self):
        """ The largest key (pubtime) held by the category, as int """
        return int(sorted(self.data.keys(), reverse=True)[0])

    @property
    def is_uptodate(self):
        """ True if the first page's index file exists and its mtime equals
        the category mtime """
        index = os.path.join(self.path_paged(), self.indexfile)
        if not os.path.isfile(index):
            return False
        return os.path.getmtime(index) == self.mtime

    @property
    def title(self):
        return ' - '.join([
            self.name,
            shared.config.get('common', 'domain')
        ])

    @property
    def is_altrender(self):
        """ True if a template named <ClassName>_<name>.html exists """
        return os.path.exists(
            os.path.join(
                shared.config.get('dirs', 'tmpl'),
                "%s_%s.html" % (
                    self.__class__.__name__,
                    self.name
                )
            )
        )

    @property
    def url(self):
        if self.name:
            url = "/%s/%s/" % (
                self.taxonomy,
                self.name,
            )
        else:
            url = '/'
        return url

    def path_paged(self, page=1, feed=False):
        """ Return (and create, if missing) the output directory of the
        given page, or of the feed when page is 1 and feed is set """
        x = shared.config.get('common', 'build')

        if self.name:
            x = os.path.join(
                x,
                self.taxonomy,
                self.name,
            )

        if page == 1:
            if feed:
                x = os.path.join(x, self.feeddir)
        else:
            x = os.path.join(x, self.pagedir, "%s" % page)

        if not os.path.isdir(x):
            os.makedirs(x)
        return x

    def write_html(self, path, content):
        """ Write content to path and set the file times to self.mtime """
        with open(path, 'wt') as out:
            logging.debug('writing file %s', path)
            out.write(content)
        os.utime(path, (self.mtime, self.mtime))

    async def render(self):
        if self.is_altrender:
            self.render_onepage()
        else:
            self.render_paginated()
        self.render_feed()

    def render_onepage(self):
        """ Render every post, grouped by year, into a single index file """
        years = {}
        for k in sorted(self.data.keys(), reverse=True):
            post = self.data[k]
            year = int(arrow.get(post.pubtime).format('YYYY'))
            if year not in years:
                years.update({year: []})
            years[year].append(post.tmplvars)

        tmplvars = {
            'taxonomy': {
                'add_welcome': self.is_front,
                'title': self.title,
                'name': self.name,
                'lastmod': arrow.get(self.mtime).format(
                    shared.ARROWFORMAT['rcf']
                ),
                'url': self.url,
                # NOTE(review): self.url always ends with '/', so this
                # contains a double slash - confirm the template expects it
                'feed': "%s/%s/" % (
                    self.url,
                    shared.config.get('site', 'feed')
                ),
            },
            'site': shared.site,
            'by_year': years
        }
        dirname = self.path_paged(1)
        o = os.path.join(dirname, self.indexfile)
        logging.info(
            "Rendering category %s to %s",
            self.name,
            o
        )
        tmplfile = "%s_%s.html" % (
            self.__class__.__name__,
            self.name
        )
        r = shared.j2.get_template(tmplfile).render(tmplvars)
        self.write_html(o, r)

    def render_feed(self):
        """ Write the Atom feed of the newest posts, then ping the hub """
        start = 0
        end = int(shared.config.getint('display', 'pagination'))
        posttmpls = [
            self.data[k].tmplvars
            for k in sorted(self.data.keys(), reverse=True)[start:end]
        ]
        dirname = self.path_paged(1, feed=True)
        o = os.path.join(dirname, self.feedfile)
        logging.info(
            "Rendering feed of category %s to %s",
            self.name,
            o
        )

        flink = "%s%s%s" % (
            shared.config.get('site', 'url'),
            self.url,
            shared.config.get('site', 'feed')
        )

        fg = FeedGenerator()
        fg.id(flink)
        fg.link(
            href=flink,
            rel='self'
        )
        fg.title(self.title)
        fg.author({
            'name': shared.site.get('author').get('name'),
            'email': shared.site.get('author').get('email')
        })
        fg.logo('%s/favicon.png' % shared.site.get('url'))
        fg.updated(arrow.get(self.mtime).to('utc').datetime)

        for p in reversed(posttmpls):
            link = '%s/%s/' % (shared.site.get('url'), p.get('slug'))
            dt = arrow.get(p.get('pubtime')).to('utc')

            content = p.get('html')
            if p.get('photo'):
                content = "%s\n\n%s" % (p.get('photo'), content)

            fe = fg.add_entry()
            fe.id(link)
            fe.link(href=link)
            fe.title(p.get('title'))
            fe.published(dt.datetime)
            fe.updated(dt.datetime)
            fe.content(
                content,
                type='CDATA'
            )
            fe.rights('%s %s %s' % (
                dt.format('YYYY'),
                shared.site.get('author').get('name'),
                p.get('licence').get('text')
            ))
            if p.get('enclosure'):
                enclosure = p.get('enclosure')
                fe.enclosure(
                    enclosure.get('url'),
                    "%d" % enclosure.get('size'),
                    enclosure.get('mime')
                )

        with open(o, 'wb') as f:
            f.write(fg.atom_str(pretty=True))

        # with open(o.replace('.xml', '.rss'), 'wb') as f:
        #     f.write(fg.rss_str(pretty=True))

        # ping pubsub
        r = requests.post(
            shared.site.get('websub').get('hub'),
            data={
                'hub.mode': 'publish',
                'hub.url': flink
            }
        )
        logging.info(r.text)

    def render_paginated(self):
        """ Render the category into numbered pages """
        pagination = shared.config.getint('display', 'pagination')
        pages = ceil(len(self.data) / pagination)
        page = 1
        while page <= pages:
            add_welcome = False
            if (self.is_front and page == 1):
                add_welcome = True
            # list relevant post templates
            start = int((page - 1) * pagination)
            end = int(start + pagination)
            posttmpls = [
                self.data[k].tmplvars
                for k in sorted(self.data.keys(), reverse=True)[start:end]
            ]
            # define data for template
            # TODO move the pagination links here, the one in jinja
            # is overcomplicated
            tmplvars = {
                'taxonomy': {
                    'add_welcome': add_welcome,
                    'title': self.title,
                    'name': self.name,
                    'page': page,
                    'total': pages,
                    'perpage': pagination,
                    'lastmod': arrow.get(self.mtime).format(
                        shared.ARROWFORMAT['rcf']
                    ),
                    'url': self.url,
                    'feed': "%s/%s/" % (
                        self.url,
                        shared.config.get('site', 'feed')
                    ),
                },
                'site': shared.site,
                'posts': posttmpls,
            }
            # render HTML
            dirname = self.path_paged(page)
            o = os.path.join(dirname, self.indexfile)
            logging.info(
                "Rendering page %d/%d of category %s to %s",
                page,
                pages,
                self.name,
                o
            )
            tmplfile = "%s.html" % (self.__class__.__name__)
            r = shared.j2.get_template(tmplfile).render(tmplvars)
            self.write_html(o, r)
            page = page + 1


class Singular(object):
    """ A single entry, built either from a markdown file with frontmatter
    or from a jpg whose metadata supplies the meta and the content """
    indexfile = 'index.html'

    def __init__(self, fpath):
        logging.debug("initiating singular object from %s", fpath)
        self.fpath = fpath
        self.mtime = os.path.getmtime(self.fpath)
        # stime starts as the source mtime; the comments property bumps it
        # when a comment file is newer
        self.stime = self.mtime
        self.fname, self.fext = os.path.splitext(os.path.basename(self.fpath))
        self.category = os.path.basename(os.path.dirname(self.fpath))
        self._images = NoDupeContainer()

        if self.fext == '.md':
            with open(self.fpath, mode='rt') as f:
                self.fm = frontmatter.parse(f.read())
                self.meta, self.content = self.fm
            self.photo = None
        elif self.fext == '.jpg':
            self.photo = WebImage(self.fpath)
            self.meta = self.photo.fm_meta
            self.content = self.photo.fm_content
            self.photo.inline = False
            self.photo.cssclass = 'u-photo'

    def init_extras(self):
        self.receive_webmentions()
        # reading the comments updates self.stime as a side effect
        _ = self.comments

    # note: due to SQLite locking, this will not be async for now
    def receive_webmentions(self):
        """ Process the queued incoming webmentions that target this entry """
        wdb = shared.WebmentionQueue()
        queued = wdb.get_queued(self.url)
        for incoming in queued:
            wm = Webmention(
                incoming.get('source'),
                incoming.get('target'),
                incoming.get('dt')
            )
            wm.receive()
            wdb.entry_done(incoming.get('id'))
        wdb.finish()

    def queue_webmentions(self):
        """ Queue outgoing webmentions for every URL to ping, unless the
        entry is in the future or the webmention already exists """
        if self.is_future:
            return
        wdb = shared.WebmentionQueue()
        for target in self.urls_to_ping:
            if not wdb.exists(self.url, target, self.published):
                wdb.queue(self.url, target)
            else:
                logging.debug(
                    "not queueing - webmention already queued from %s to %s",
                    self.url,
                    target
                )
        wdb.finish()

    @property
    def urls_to_ping(self):
        """ Unique URLs from the content, the reply target and the
        syndication list, skipping those on the site's own domains """
        urls = [
            x.strip()
            for x in shared.REGEX.get('urls').findall(self.content)
        ]
        if self.is_reply:
            urls.append(self.is_reply)
        for url in self.syndicate:
            urls.append(url)
        r = {}
        for link in urls:
            parsed = urlparse(link)
            if parsed.netloc in shared.config.get('site', 'domains'):
                continue
            if link in r:
                continue
            r.update({link: True})
        return r.keys()

    @property
    def redirects(self):
        """ Unique list of the 'redirect' meta entries plus the shortslug """
        # copy, so the list stored in self.meta is not grown on every access
        r = list(self.meta.get('redirect', []))
        r.append(self.shortslug)
        return list(set(r))

    @property
    def is_uptodate(self):
        """ True if the rendered file exists and is not older than stime """
        for f in [self.htmlfile]:
            if not os.path.isfile(f):
                return False
            mtime = os.path.getmtime(f)
            if mtime < self.stime:
                return False
        return True

    @property
    def htmlfile(self):
        return os.path.join(
            shared.config.get('common', 'build'),
            self.fname,
            self.indexfile
        )

    @property
    def images(self):
        """ Container of the entry's own photo and its inline images """
        # guard against re-adding the photo on repeated access, which would
        # only produce duplicate key errors in the log
        if self.photo and self.fname not in self._images:
            self._images.append(self.fname, self.photo)
        # add inline images
        for shortcode, alt, fname, title, css in self.inline_images:
            # this does the appending automatically
            self._find_image(fname)
        return self._images

    @property
    def comments(self):
        """ Container of Comment objects keyed by the comment file mtime;
        also raises self.stime to the newest comment file mtime """
        comments = NoDupeContainer()
        cfiles = []
        lookin = [*self.redirects, self.fname]
        for d in lookin:
            maybe = glob.glob(
                os.path.join(
                    shared.config.get('dirs', 'comment'),
                    d,
                    '*.md'
                )
            )
            cfiles = [*cfiles, *maybe]
        for cpath in cfiles:
            cmtime = os.path.getmtime(cpath)
            if cmtime > self.stime:
                self.stime = cmtime

            c = Comment(cpath)
            comments.append(c.mtime, c)
        return comments

    @property
    def replies(self):
        """ Sorted (mtime, tmplvars) pairs of comments of type webmention """
        r = {}
        for mtime, c in self.comments:
            if c.type == 'webmention':
                r.update({mtime: c.tmplvars})
        return sorted(r.items())

    @property
    def reactions(self):
        """ Non-webmention comments grouped by type; each group is a sorted
        list of (mtime, tmplvars) pairs """
        r = {}
        for mtime, c in self.comments:
            if c.type == 'webmention':
                continue
            if c.type not in r:
                r[c.type] = {}
            r[c.type].update({mtime: c.tmplvars})

        for icon, comments in r.items():
            r[icon] = sorted(comments.items())
        return r

    @property
    def exif(self):
        if not self.photo:
            return {}
        return self.photo.exif

    @property
    def published(self):
        return arrow.get(self.meta.get('published', self.mtime))

    @property
    def updated(self):
        u = self.meta.get('updated', False)
        if u:
            u = arrow.get(u)
        return u

    @property
    def pubtime(self):
        return int(self.published.timestamp)

    @property
    def is_reply(self):
        return self.meta.get('in-reply-to', False)

    @property
    def is_future(self):
        now = arrow.utcnow().timestamp
        return self.pubtime > now

    @property
    def licence(self):
        l = shared.config.get(
            'licence',
            self.category,
            fallback=shared.config.get('licence', 'default',)
        )
        return {
            'text': 'CC %s 4.0' % l.upper(),
            'url': 'https://creativecommons.org/licenses/%s/4.0/' % l,
        }

    @property
    def corpus(self):
        """ Text blob of title, name, summary and content (plus the tags
        for photos) handed to the search database """
        corpus = "\n".join([
            "%s" % self.meta.get('title', ''),
            "%s" % self.fname,
            "%s" % self.meta.get('summary', ''),
            "%s" % self.content,
        ])

        if self.photo:
            corpus = corpus + "\n".join(self.tags)

        return corpus

    @property
    def lang(self):
        # default is English, this will only be changed if the try
        # succeeds and actually detects a language
        lang = 'en'
        try:
            lang = langdetect.detect("\n".join([
                self.fname,
                self.meta.get('title', ''),
                self.content
            ]))
        except Exception:
            # best effort: keep the default, but let KeyboardInterrupt and
            # SystemExit through
            pass
        return lang

    def _find_image(self, fname):
        """ Locate fname under the files directory, register it in
        self._images and return the WebImage; None if not found """
        fname = os.path.basename(fname)
        pattern = os.path.join(
            shared.config.get('dirs', 'files'),
            '**',
            fname
        )
        logging.debug('trying to locate image %s in %s', fname, pattern)
        maybe = glob.glob(pattern)

        if not maybe:
            logging.error('image not found: %s', fname)
            return None

        maybe = maybe.pop()
        logging.debug('image found: %s', maybe)
        if fname not in self._images:
            im = WebImage(maybe)
            self._images.append(fname, im)
        return self._images[fname]

    @property
    def inline_images(self):
        return shared.REGEX['mdimg'].findall(self.content)

    @property
    def url(self):
        return "%s/%s/" % (shared.config.get('site', 'url'), self.fname)

    @property
    def body(self):
        """ The content with inline image shortcodes replaced by the
        rendered image markup """
        body = "%s" % (self.content)
        # get inline images, downsize them and convert them to figures
        for shortcode, alt, fname, title, css in self.inline_images:
            im = self._find_image(fname)
            if not im:
                continue

            im.alt = alt
            im.title = title
            im.cssclass = css
            body = body.replace(shortcode, str(im))
        return body

    @property
    def html(self):
        return shared.Pandoc().convert("%s" % (self.body))

    @property
    def title(self):
        """ Meta title; 'RE: <target>' for replies; else the publish date """
        maybe = self.meta.get('title', False)
        if maybe:
            return maybe
        if self.is_reply:
            return "RE: %s" % self.is_reply
        return self.published.format(shared.ARROWFORMAT['display'])

    @property
    def review(self):
        return self.meta.get('review', False)

    @property
    def summary(self):
        s = self.meta.get('summary', '')
        if not s:
            return s
        # cached, the conversion is only done once
        if not hasattr(self, '_summary'):
            self._summary = shared.Pandoc().convert(s)
        return self._summary

    @property
    def shortslug(self):
        return shared.baseN(self.pubtime)

    @property
    def syndicate(self):
        """ The 'syndicate' meta URLs, plus the brid.gy flickr publish URL
        for photos """
        # copy, so the list stored in self.meta is not grown on every access
        urls = list(self.meta.get('syndicate', []))
        bridgy = "https://brid.gy/publish/flickr"
        if self.photo and self.photo.is_photo and bridgy not in urls:
            urls.append(bridgy)
        return urls

    @property
    def tags(self):
        return self.meta.get('tags', [])

    @property
    def description(self):
        return html.escape(self.meta.get('summary', ''))

    @property
    def tmplvars(self):
        # very simple caching because we might use this 4 times:
        # post HTML, category, front posts and atom feed
        if not hasattr(self, '_tmplvars'):
            self._tmplvars = {
                'title': self.title,
                'pubtime': self.published.format(
                    shared.ARROWFORMAT['iso']
                ),
                'pubdate': self.published.format(
                    shared.ARROWFORMAT['display']
                ),
                'pubrfc': self.published.format(
                    shared.ARROWFORMAT['rcf']
                ),
                'category': self.category,
                'html': self.html,
                'lang': self.lang,
                'slug': self.fname,
                'shortslug': self.shortslug,
                'licence': self.licence,
                'is_reply': self.is_reply,
                'age': int(self.published.format('YYYY')) -
                int(arrow.utcnow().format('YYYY')),
                'summary': self.summary,
                'description': self.description,
                'replies': self.replies,
                'reactions': self.reactions,
                'syndicate': self.syndicate,
                'tags': self.tags,
                'photo': False,
                'enclosure': False,
                'review': self.review
            }
            if self.photo:
                self._tmplvars.update({
                    'photo': str(self.photo),
                    'enclosure': {
                        'mime': self.photo.mime_type,
                        'size': self.photo.mime_size,
                        'url': self.photo.href
                    }
                })
        return self._tmplvars

    async def render(self):
        """ Render the entry to its html file and set the file times to
        self.stime """
        logging.info('rendering %s', self.fname)
        o = self.htmlfile

        tmplfile = "%s.html" % (self.__class__.__name__)
        r = shared.j2.get_template(tmplfile).render({
            'post': self.tmplvars,
            'site': shared.site,
        })

        d = os.path.dirname(o)
        if not os.path.isdir(d):
            logging.debug('creating directory %s', d)
            os.makedirs(d)
        with open(o, 'wt') as out:
            logging.debug('writing file %s', o)
            out.write(r)
        # use the comment time, not the source file time for this
        os.utime(o, (self.stime, self.stime))

    def __repr__(self):
        return "%s/%s" % (self.category, self.fname)


class WebImage(object):
    """ An image file with its (cached) metadata and downsized variants """

    def __init__(self, fpath):
        logging.info("parsing image: %s", fpath)
        self.fpath = fpath
        self.mtime = os.path.getmtime(self.fpath)
        bname = os.path.basename(fpath)
        self.fname, self.fext = os.path.splitext(bname)
        self.title = ''
        self.alt = bname
        self.target = ''
        self.cssclass = ''

    @property
    def fm_content(self):
        return self.meta.get('Description', '')

    @property
    def fm_meta(self):
        return {
            'published': self.meta.get(
                'ReleaseDate',
                self.meta.get('ModifyDate')
            ),
            'title': self.meta.get('Headline', self.fname),
            'tags': list(set(self.meta.get('Subject', []))),
        }

    @property
    def mime_type(self):
        return str(self.meta.get('MIMEType', 'image/jpeg'))

    @property
    def mime_size(self):
        """ File size of the largest downsized variant if there is one,
        the FileSize metadata value otherwise """
        if self.is_downsizeable:
            try:
                return int(self.sizes[-1][1]['fsize'])
            except Exception:
                # fall back to the size recorded in the metadata
                pass
        return int(self.meta.get('FileSize'))

    @property
    def href(self):
        if len(self.target):
            return self.target

        if not self.is_downsizeable:
            return False

        return self.sizes[-1][1]['url']

    @property
    def src(self):
        # if the image is too small to downsize, it will be copied over
        # so the link needs to point at the copy
        src = "/%s/%s" % (
            shared.config.get('common', 'files'),
            "%s%s" % (self.fname, self.fext)
        )

        if self.is_downsizeable:
            try:
                src = [
                    e for e in self.sizes
                    if e[0] == shared.config.getint('photo', 'default')
                ][0][1]['url']
            except Exception:
                # no variant of the default size: keep the plain link
                pass
        return src

    @property
    def meta(self):
        """ Image metadata, read through shared.ExifTool and cached both on
        the instance and in a JSON file under the cache directory """
        if not hasattr(self, '_exif'):
            # reading EXIF is expensive enough even with a static generator
            # to consider caching it, so I'll do that here
            cpath = os.path.join(
                shared.config.get('var', 'cache'),
                "%s.exif.json" % self.fname
            )

            if os.path.exists(cpath):
                cmtime = os.path.getmtime(cpath)
                if cmtime >= self.mtime:
                    with open(cpath, 'rt') as f:
                        self._exif = json.loads(f.read())
                        return self._exif

            self._exif = shared.ExifTool(self.fpath).read()
            if not os.path.isdir(shared.config.get('var', 'cache')):
                os.makedirs(shared.config.get('var', 'cache'))
            with open(cpath, 'wt') as f:
                f.write(json.dumps(self._exif))
        return self._exif

    @property
    def is_photo(self):
        """ True if the Copyright or the Artist metadata field matches the
        configured photo regex """
        # missing regex from config
        if 'photo' not in shared.REGEX:
            logging.debug('%s photo regex missing from config', self.fpath)
            return False

        cpr = self.meta.get('Copyright', '')
        art = self.meta.get('Artist', '')

        # both Artist and Copyright missing from EXIF
        if not cpr and not art:
            logging.debug(
                '%s Artist or Copyright missing from EXIF',
                self.fpath
            )
            return False

        # we have regex, Artist and Copyright, try matching them
        pattern = re.compile(shared.config.get('photo', 'regex'))
        if pattern.search(cpr) or pattern.search(art):
            return True

        logging.debug('%s patterns did not match', self.fpath)
        return False

    @property
    def exif(self):
        """ Selected metadata fields for photos; empty dict otherwise """
        exif = {}
        if not self.is_photo:
            return exif

        mapping = {
            'camera': ['Model'],
            'aperture': ['FNumber', 'Aperture'],
            'shutter_speed': ['ExposureTime'],
            # 'focallength': ['FocalLengthIn35mmFormat', 'FocalLength'],
            'focallength': ['FocalLength'],
            'iso': ['ISO'],
            'lens': ['LensID', 'LensSpec', 'Lens'],
            'geo_latitude': ['GPSLatitude'],
            'geo_longitude': ['GPSLongitude'],
        }

        for ekey, candidates in mapping.items():
            # the first candidate with a value wins
            for candidate in candidates:
                maybe = self.meta.get(candidate, None)
                if not maybe:
                    continue
                elif 'geo_' in ekey:
                    exif[ekey] = round(float(maybe), 5)
                else:
                    exif[ekey] = maybe
                break
        return exif

    @property
    def sizes(self):
        """ Sorted list of (size, info) tuples for every configured downsize
        option that is not larger than the image's longer side """
        sizes = []
        _max = max(
            int(self.meta.get('ImageWidth')),
            int(self.meta.get('ImageHeight'))
        )

        for size in shared.config.options('downsize'):
            if _max < int(size):
                continue

            name = '%s_%s%s' % (
                self.fname,
                shared.config.get('downsize', size),
                self.fext
            )

            fpath = os.path.join(
                shared.config.get('common', 'build'),
                shared.config.get('common', 'files'),
                name
            )

            on_disk = os.path.isfile(fpath)
            exists = on_disk
            # in case there is a downsized image compare against the main
            # file's mtime and invalidate the existing if it's older
            if exists:
                mtime = os.path.getmtime(fpath)
                if self.mtime > mtime:
                    exists = False

            smeta = {
                'fpath': fpath,
                # previously the staleness check above was computed and then
                # ignored, so outdated variants were never regenerated
                'exists': exists,
                'url': "%s/%s/%s" % (
                    shared.config.get('site', 'url'),
                    shared.config.get('common', 'files'),
                    name
                ),
                'crop': shared.config.getboolean(
                    'crop',
                    size,
                    fallback=False
                ),
                'fsize': int(self.meta.get('FileSize'))
            }

            if on_disk:
                smeta.update({
                    'fsize': os.path.getsize(fpath)
                })

            sizes.append((
                int(size),
                smeta
            ))
        return sorted(sizes, reverse=False)

    @property
    def is_downsizeable(self):
        """ Check if the image is large enough to downsize it """
        ftype = self.meta.get('FileType', None)
        if not ftype:
            return False
        elif ftype.lower() != 'jpeg' and ftype.lower() != 'png':
            return False

        _max = max(
            int(self.meta.get('ImageWidth')),
            int(self.meta.get('ImageHeight'))
        )
        _min = shared.config.getint('photo', 'default')
        return _max > _min

    def _maybe_watermark(self, img):
        """ Composite image by adding watermark file over it """
        if not self.is_photo:
            logging.debug("not watermarking: not a photo")
            return img

        wmarkfile = shared.config.get('photo', 'watermark')
        if not os.path.isfile(wmarkfile):
            logging.debug("not watermarking: watermark not found")
            return img

        logging.debug("%s is a photo, applying watermarking", self.fpath)
        with wand.image.Image(filename=wmarkfile) as wmark:
            if img.width > img.height:
                w = img.width * 0.2
                h = wmark.height * (w / wmark.width)
                x = img.width - w - (img.width * 0.01)
                y = img.height - h - (img.height * 0.01)
            else:
                # the watermark is rotated below, so its width and height
                # swap roles when the position is calculated
                w = img.height * 0.16
                h = wmark.height * (w / wmark.width)
                x = img.width - h - (img.width * 0.01)
                y = img.height - w - (img.height * 0.01)

            w = round(w)
            h = round(h)
            x = round(x)
            y = round(y)

            wmark.resize(w, h)
            if img.width <= img.height:
                wmark.rotate(-90)
            img.composite(image=wmark, left=x, top=y)
        return img

    def _copy(self):
        """ Copy the original file to the build files directory unless the
        copy there is at least as new """
        fname = "%s%s" % (self.fname, self.fext)
        fpath = os.path.join(
            shared.config.get('common', 'build'),
            shared.config.get('common', 'files'),
            fname
        )
        if os.path.isfile(fpath):
            mtime = os.path.getmtime(fpath)
            if self.mtime <= mtime:
                return
        # the target directory might not exist yet
        os.makedirs(os.path.dirname(fpath), exist_ok=True)
        logging.info("copying %s to build dir", fname)
        shutil.copy(self.fpath, fpath)

    def _intermediate_dimension(self, size, width, height, crop=False):
        """ Calculate intermediate resize dimension and return a tuple of
        width, height """
        size = int(size)
        if (width > height and not crop) \
                or (width < height and crop):
            w = size
            h = int(float(size / width) * height)
        else:
            h = size
            w = int(float(size / height) * width)
        return (w, h)

    def _intermediate(self, img, size, target, crop=False):
        """ Write a copy of img, resized for size, to target; return False
        without writing if img is smaller than size in both dimensions """
        if img.width < size and img.height < size:
            return False

        with img.clone() as thumb:
            width, height = self._intermediate_dimension(
                size,
                img.width,
                img.height,
                crop
            )
            thumb.resize(width, height)

            if crop:
                thumb.liquid_rescale(size, size, 1, 1)

            if self.meta.get('FileType', 'jpeg').lower() == 'jpeg':
                thumb.compression_quality = 94
                thumb.unsharp_mask(
                    radius=1,
                    sigma=0.5,
                    amount=0.7,
                    threshold=0.5
                )
                thumb.format = 'pjpeg'

            # this is to make sure pjpeg happens
            with open(target, 'wb') as f:
                logging.info("writing %s", target)
                thumb.save(file=f)

    @property
    def needs_downsize(self):
        """ True if any of the downsized variants is flagged as missing """
        needed = False
        for (size, downsized) in self.sizes:
            if downsized.get('exists', False):
                logging.debug(
                    "size %d exists: %s",
                    size,
                    downsized.get('fpath')
                )
                continue
            logging.debug(
                "size %d missing: %s",
                size,
                downsized.get('fpath')
            )
            needed = True
        return needed

    async def downsize(self):
        """ Generate the downsized variants; images that can not be
        downsized are copied over instead """
        if not self.is_downsizeable:
            return self._copy()
        if not self.needs_downsize and not shared.config.getboolean(
                'params', 'regenerate'):
            return

        build_files = os.path.join(
            shared.config.get('common', 'build'),
            shared.config.get('common', 'files'),
        )

        if not os.path.isdir(build_files):
            os.makedirs(build_files)

        logging.info("downsizing %s%s", self.fname, self.fext)
        with wand.image.Image(filename=self.fpath) as img:
            img.auto_orient()
            img = self._maybe_watermark(img)
            for (size, downsized) in self.sizes:
                self._intermediate(
                    img,
                    size,
                    downsized['fpath'],
                    downsized['crop']
                )

    @property
    def src_size(self):
        """ (width, height) of the image that `src` points at """
        width = int(self.meta.get('ImageWidth'))
        height = int(self.meta.get('ImageHeight'))

        if not self.is_downsizeable:
            return width, height

        return self._intermediate_dimension(
            shared.config.getint('photo', 'default'),
            width,
            height
        )

    @property
    def tmplvars(self):
        src_width, src_height = self.src_size

        return {
            'src': self.src,
            'width': src_width,
            'height': src_height,
            'target': self.href,
            'css': self.cssclass,
            'title': self.title,
            'alt': self.alt,
            'exif': self.exif,
            'is_photo': self.is_photo,
            'author': self.meta.get('Artist', ''),
        }

    def __repr__(self):
        return "Image: %s, photo: %r, EXIF: %s" % (
            self.fname, self.is_photo, self.exif
        )

    def __str__(self):
        tmplfile = "%s.html" % (self.__class__.__name__)
        return shared.j2.get_template(tmplfile).render({
            'photo': self.tmplvars
        })


class Comment(object):
    """ A stored comment: a markdown file with frontmatter """

    def __init__(self, fpath):
        logging.debug("initiating comment object from %s", fpath)
        self.fpath = fpath
        self.mtime = os.path.getmtime(self.fpath)
        with open(self.fpath, mode='rt') as f:
            self.fm = frontmatter.parse(f.read())
            self.meta, self.content = self.fm

    @property
    def dt(self):
        return arrow.get(self.meta.get('date'))

    @property
    def html(self):
        return shared.Pandoc().convert("%s" % (self.content))

    @property
    def target(self):
        """ Last path segment of the 'target' meta URL """
        t = urlparse(self.meta.get('target'))
        return t.path.rstrip('/').strip('/').split('/')[-1]

    @property
    def source(self):
        return self.meta.get('source')

    @property
    def author(self):
        """ Dict with the author name and the source URL; the name falls
        back to the hostname of the author URL or of the source """
        r = {
            'name': urlparse(self.source).hostname,
            'url': self.source
        }

        author = self.meta.get('author')
        if not author:
            return r

        if 'name' in author:
            r.update({'name': author.get('name')})
        elif 'url' in author:
            r.update({'name': urlparse(author.get('url')).hostname})

        return r

    @property
    def type(self):
        # caching, because calling Pandoc is expensive
        if not hasattr(self, '_type'):
            self._type = 'webmention'
            t = self.meta.get('type', 'webmention')
            if t != 'webmention':
                self._type = '★'

            # content that is a single known emoji becomes the type
            if len(self.content):
                maybe = shared.Pandoc('plain').convert(self.content)
                if maybe in UNICODE_EMOJI:
                    self._type = maybe
        return self._type

    @property
    def tmplvars(self):
        if not hasattr(self, '_tmplvars'):
            self._tmplvars = {
                'author': self.author,
                'source': self.source,
                'pubtime': self.dt.format(shared.ARROWFORMAT['iso']),
                'pubdate': self.dt.format(shared.ARROWFORMAT['display']),
                'html': self.html,
                'type': self.type
            }
        return self._tmplvars

    def __repr__(self):
        return "Comment from %s for %s" % (
            self.source, self.target
        )

    def __str__(self):
        tmplfile = "%s.html" % (__class__.__name__)
        return shared.j2.get_template(tmplfile).render({
            'comment': self.tmplvars
        })


class Webmention(object):
    """ A single webmention that can be sent to, or received from, a URL """

    def __init__(self, source, target, dt=None):
        """
        source: URL of the mentioning document
        target: URL of the mentioned document
        dt: time of the webmention; the current time when omitted
        """
        self.source = source
        self.target = target
        # the default used to be evaluated once, at import time, which
        # stamped every default-dated webmention with the module load time
        if dt is None:
            self.dt = arrow.utcnow()
        else:
            self.dt = arrow.get(dt).to('utc')
        logging.info(
            "processing webmention %s => %s",
            self.source,
            self.target
        )
        self._source = None

    def send(self):
        """ Discover the webmention endpoint of the target and post to it

        Returns True when the entry needs no further attempts (sent, no
        endpoint found or probable brid.gy duplicate), False on failure
        """
        rels = shared.XRay(self.target).set_discover().parse()
        endpoint = False
        if 'rels' not in rels:
            logging.debug("no rel found for %s", self.target)
            return True
        for k in rels.get('rels').keys():
            if 'webmention' in k:
                endpoint = rels.get('rels').get(k).pop()
                break
        if not endpoint:
            logging.debug("no endpoint found for %s", self.target)
            return True
        logging.info(
            "Sending webmention to endpoint: %s, source: %s, target: %s",
            endpoint,
            self.source,
            self.target,
        )
        try:
            p = requests.post(
                endpoint,
                data={
                    'source': self.source,
                    'target': self.target
                }
            )
            if p.status_code == requests.codes.ok:
                logging.info("webmention sent")
                return True
            elif p.status_code == 400 and 'brid.gy' in self.target:
                logging.warning(
                    "potential bridgy duplicate: %s %s",
                    p.status_code,
                    p.text
                )
                return True
            else:
                logging.error(
                    "webmention failure: %s %s",
                    p.status_code,
                    p.text
                )
                return False
        except Exception as e:
            logging.error("sending webmention failed: %s", e)
            return False

    def receive(self):
        """ Check the source, then save the webmention; a source that
        answers 410 deletes a previously saved webmention """
        head = requests.head(self.source)
        if head.status_code == 410:
            self._delete()
            return
        elif head.status_code != requests.codes.ok:
            logging.error(
                "webmention source failure: %s %s",
                head.status_code,
                self.source
            )
            return

        self._source = shared.XRay(self.source).parse()
        if 'data' not in self._source:
            logging.error(
                "no data found in webmention source: %s",
                self.source
            )
            return
        self._save()

    def _delete(self):
        if os.path.isfile(self.fpath):
            logging.info("Deleting webmention %s", self.fpath)
            os.unlink(self.fpath)
        return

    def _save(self):
        fm = frontmatter.loads('')
        fm.content = self.content
        fm.metadata = self.meta
        with open(self.fpath, 'wt') as f:
            logging.info("Saving webmention to %s", self.fpath)
            f.write(frontmatter.dumps(fm))
        return

    @property
    def relation(self):
        """ The first known relation key found in the parsed source data,
        'webmention' if none of them is present """
        r = 'webmention'
        k = self._source.get('data').keys()
        for maybe in ['in-reply-to', 'repost-of', 'bookmark-of', 'like-of']:
            if maybe in k:
                r = maybe
                break
        return r

    @property
    def meta(self):
        if not hasattr(self, '_meta'):
            self._meta = {
                'author': self._source.get('data').get('author'),
                'type': self.relation,
                'target': self.target,
                'source': self.source,
                'date': self._source.get('data').get('published'),
            }
        return self._meta

    @property
    def content(self):
        """ The html (preferred) or text content of the parsed source,
        converted with Pandoc; empty string if there is none """
        if 'content' not in self._source.get('data'):
            return ''
        elif 'html' in self._source.get('data').get('content'):
            what = self._source.get('data').get('content').get('html')
        elif 'text' in self._source.get('data').get('content'):
            what = self._source.get('data').get('content').get('text')
        else:
            return ''
        return shared.Pandoc('html').convert(what)

    @property
    def fname(self):
        return "%d-%s.md" % (
            self.dt.timestamp,
            shared.slugfname(self.source)
        )

    @property
    def fpath(self):
        """ Path of the comment file; its directory, named after the last
        path segment of the target, is created if missing """
        tdir = os.path.join(
            shared.config.get('dirs', 'comment'),
            self.target.rstrip('/').strip('/').split('/')[-1]
        )
        if not os.path.isdir(tdir):
            os.makedirs(tdir)
        return os.path.join(
            tdir,
            self.fname
        )


class Worker(object):
    """ Collects coroutines as tasks and runs them on the event loop """

    def __init__(self):
        self._tasks = []
        self._loop = asyncio.get_event_loop()

    def append(self, job):
        task = self._loop.create_task(job)
        self._tasks.append(task)

    def run(self):
        """ Run every collected task to completion, then close the loop """
        # asyncio.wait raises ValueError on an empty set of tasks
        if self._tasks:
            w = asyncio.wait(self._tasks)
            self._loop.run_until_complete(w)
        self._loop.close()


def setup():
    """ parse input parameters and add them as params section to config """
    parser = argparse.ArgumentParser(description='Parameters for NASG')

    booleanparams = {
        'regenerate': 'force downsizing images',
        'force': 'force rendering HTML',
    }

    for k, v in booleanparams.items():
        parser.add_argument(
            '--%s' % (k),
            action='store_true',
            default=False,
            help=v
        )

    parser.add_argument(
        '--loglevel',
        default='warning',
        help='change loglevel'
    )

    if not shared.config.has_section('params'):
        shared.config.add_section('params')

    params = vars(parser.parse_args())
    for k, v in params.items():
        shared.config.set('params', k, str(v))

    # remove the rest of the potential loggers
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])

    logging.basicConfig(
        level=shared.LLEVEL[shared.config.get('params', 'loglevel')],
        format='%(asctime)s - %(levelname)s - %(message)s'
    )


def youngest_mtime(root):
    """ Return the largest mtime of anything under root, 0 if nothing is
    found """
    files = glob.glob(os.path.join(root, '**'), recursive=True)
    return max((os.path.getmtime(f) for f in files), default=0)


def build():
    """ Parse the content, render what is out of date, send the queued
    webmentions, copy the static files and write the sitemap """
    setup()

    worker = Worker()
    content = Content()
    sdb = shared.SearchDB()
    magic = MagicPHP()

    collector_front = Category(is_front=True)
    collector_categories = NoDupeContainer()
    sitemap = {}

    for f, post in content:
        logging.info("PARSING %s", f)
        post.init_extras()
        post.queue_webmentions()

        # add to sitemap
        sitemap.update({post.url: post.mtime})

        # extend redirects
        for r in post.redirects:
            magic.redirects.append((r, post.fname))

        # add post to search, if needed
        if not sdb.is_uptodate(post.fname, post.mtime):
            sdb.append(
                post.fname,
                post.corpus,
                post.mtime,
                post.url,
                post.category,
                post.title
            )

        # add render task, if needed
        if not post.is_uptodate or shared.config.getboolean('params', 'force'):
            worker.append(post.render())

        # collect images to downsize
        for fname, im in post.images:
            worker.append(im.downsize())

        # skip adding future posts to any category
        if post.is_future:
            continue

        # skip categories starting with _
        if post.category.startswith('_'):
            continue

        # get the category otherwise
        if post.category not in collector_categories:
            c = Category(post.category)
            collector_categories.append(post.category, c)
        else:
            c = collector_categories[post.category]

        # add post to category
        c.append(post)

        # add post to front
        collector_front.append(post)

    # write search db
    sdb.finish()

    # render front
    if not collector_front.is_uptodate or \
            shared.config.getboolean('params', 'force'):
        worker.append(collector_front.render())

    # render categories
    for name, c in collector_categories:
        if not c.is_uptodate or shared.config.getboolean('params', 'force'):
            worker.append(c.render())

    # add magic.php rendering
    worker.append(magic.render())

    # do all the things!
    worker.run()

    # send webmentions - this is synchronous due to the SQLite locking
    wdb = shared.WebmentionQueue()
    for out in wdb.get_outbox():
        wm = Webmention(
            out.get('source'),
            out.get('target'),
            out.get('dt')
        )
        if wm.send():
            wdb.entry_done(out.get('id'))
    wdb.finish()

    # copy static
    logging.info('copying static files')
    src = shared.config.get('dirs', 'static')
    for item in os.listdir(src):
        s = os.path.join(src, item)
        stime = os.path.getmtime(s)
        d = os.path.join(shared.config.get('common', 'build'), item)
        dtime = 0
        if os.path.exists(d):
            dtime = os.path.getmtime(d)

        if not os.path.exists(d) \
                or shared.config.getboolean('params', 'force') \
                or dtime < stime:
            logging.debug("copying static file %s to %s", s, d)
            shutil.copy2(s, d)
        if '.html' in item:
            url = "%s/%s" % (shared.config.get('site', 'url'), item)
            sitemap.update({
                url: os.path.getmtime(s)
            })

    # dump sitemap, if needed
    sitemapf = os.path.join(
        shared.config.get('common', 'build'),
        'sitemap.txt'
    )
    sitemap_update = True
    if os.path.exists(sitemapf):
        # default=0 keeps an empty sitemap from raising in max()
        newest = int(max(sitemap.values(), default=0))
        if newest <= int(os.path.getmtime(sitemapf)):
            sitemap_update = False

    if sitemap_update:
        logging.info('writing updated sitemap')
        with open(sitemapf, 'wt') as smap:
            smap.write("\n".join(sorted(sitemap.keys())))


if __name__ == '__main__':
    build()