#!/usr/bin/env python3 import os import re import configparser import argparse import shutil import logging import json import glob import tempfile import atexit import re import hashlib import math import asyncio import csv import getpass #import quopri #import base64 #import mimetypes import copy import magic import arrow import wand.image #import similar_text import frontmatter from slugify import slugify import langdetect import requests from whoosh import index from whoosh import qparser import jinja2 import urllib.parse from webmentiontools.send import WebmentionSend import bleach from emoji import UNICODE_EMOJI #from bs4 import BeautifulSoup #from readability.readability import Document import shared #import oauth def splitpath(path): parts = [] (path, tail) = os.path.split(path) while path and tail: parts.insert(0,tail) (path,tail) = os.path.split(path) return parts class BaseIter(object): def __init__(self): self.data = {} def append(self, key, value): if key in self.data: logging.warning("duplicate key: %s, using existing instead", key) existing = self.data.get(key) if hasattr(value, 'fname') and hasattr(existing, 'fname'): logging.warning( "%s collides with existing %s", value.fname, existing.fname ) return self.data[key] = value def __getitem__(self, key): return self.data.get(key, {}) def __repr__(self): return json.dumps(list(self.data.values())) def __next__(self): try: r = self.data.next() except: raise StopIteration() return r def __iter__(self): for k, v in self.data.items(): yield (k, v) return class BaseRenderable(object): def __init__(self): return def writerendered(self, content, mtime=None): mtime = mtime or self.mtime d = os.path.dirname(self.target) if not os.path.isdir(d): os.mkdir(d) with open(self.target, "w") as html: logging.debug('writing %s', self.target) html.write(content) html.close() os.utime(self.target, (mtime, mtime)) class Indexer(object): def __init__(self): self.target = os.path.abspath(os.path.join( shared.config.get('target', 'builddir'), shared.config.get('var', 'searchdb') )) if not os.path.isdir(self.target): os.mkdir(self.target) self.mtime = 0 if index.exists_in(self.target): self.ix = index.open_dir(self.target) tocfiles = glob.glob(os.path.join( self.target, "_MAIN_*.toc" )) if len(tocfiles): self.mtime = int(os.path.getmtime(tocfiles[0])) else: self.ix = index.create_in(self.target, shared.schema) self.writer = self.ix.writer() self.qp = qparser.QueryParser("url", schema=shared.schema) async def append(self, singular): if singular.isfuture: return logging.debug("searching for existing index for %s", singular.fname) if self.mtime >= singular.mtime: logging.debug("search index is newer than post mtime (%d vs %d), skipping post", self.mtime, singular.mtime) return exists = False q = self.qp.parse(singular.url) r = self.ix.searcher().search(q, limit=1) if r: r = r[0] # nothing to do, the entry is present and is up to date ixtime = r['mtime'] if int(ixtime) == int(singular.mtime): logging.info("search index is up to date for %s", singular.fname) return else: logging.info("search index is out of date: %d (indexed) vs %d", ixtime, singular.mtime) exists = True reactions = [] for t, v in singular.reactions.items(): if isinstance(v, list) and len(v): reactions = [*reactions, *v] content = "\n\n".join([ "\n\n".join([ bleach.clean(c, tags=[], strip_comments=True, strip=True) for c in [singular.sumhtml, singular.html] ]), "\n".join(["%s" % c for c in singular.tags]), "\n\n".join(reactions) ]) weight = 1 if singular.isbookmark or singular.isfav: weight = 10 if singular.ispage: weight = 100 if not len(singular.title): title = singular.fname else: title = singular.title if singular.photo: img = shared.Pandoc().convert("%s" % singular.photo) else: img = '' args = { 'url': singular.url, 'category': singular.category, 'date': singular.published.datetime, 'title': title, 'weight': weight, 'img': img, 'content': content, 'fuzzy': content, 'mtime': singular.mtime } if exists: logging.info("updating search index with %s", singular.fname) self.writer.add_document(**args) else: logging.info("appending search index with %s", singular.fname) self.writer.update_document(**args) def finish(self): self.writer.commit() class Renderer(object): def __init__(self): self.sitevars = dict(shared.config.items('site')) self.sitevars['author'] = dict(shared.config.items('author')) self.sitevars['author']['socials'] = dict(shared.config.items('socials')) self.sitevars['author']['qr'] = dict(shared.config.items('qr')) self.jinjaldr = jinja2.FileSystemLoader( searchpath=shared.config.get('source', 'templatesdir') ) self.j2 = jinja2.Environment(loader=self.jinjaldr) self.j2.filters['date'] = Renderer.jinja_filter_date self.j2.filters['search'] = Renderer.jinja_filter_search self.j2.filters['slugify'] = Renderer.jinja_filter_slugify @staticmethod def jinja_filter_date(d, form='%Y-%m-%d %H:%m:%S'): if d == 'now': d = arrow.now().datetime if form == 'c': return d.isoformat() #form = '%Y-%m-%dT%H:%M:%S%z' return d.strftime(form) @staticmethod def jinja_filter_slugify(s): return slugify(s, only_ascii=True, lower=True) @staticmethod def jinja_filter_search(s, r): if r in s: return True return False # based on http://stackoverflow.com/a/10075210 class ExifTool(shared.CMDLine): """ Handles calling external binary `exiftool` in an efficient way """ sentinel = "{ready}\n" def __init__(self): super().__init__('exiftool') def run(self, *filenames): return json.loads(self.execute( '-sort', '-json', '-MIMEType', '-FileType', '-FileName', '-ModifyDate', '-CreateDate', '-DateTimeOriginal', '-ImageHeight', '-ImageWidth', '-Aperture', '-FOV', '-ISO', '-FocalLength', '-FNumber', '-FocalLengthIn35mmFormat', '-ExposureTime', '-Copyright', '-Artist', '-Model', '-GPSLongitude#', '-GPSLatitude#', '-LensID', *filenames)) class Comment(BaseRenderable): def __init__(self, path): logging.debug("initiating comment object from %s", path) self.path = path self.fname, self.ext = os.path.splitext(os.path.basename(self.path)) self.mtime = int(os.path.getmtime(self.path)) self.meta = {} self.content = '' self.tmplfile = 'comment.html' self.__parse() def __repr__(self): return "%s" % (self.path) def __parse(self): with open(self.path, mode='rt') as f: self.meta, self.content = frontmatter.parse(f.read()) @property def reacji(self): if hasattr(self, '_reacji'): return self._reacji self._reacji = '' maybe = bleach.clean( self.content, tags=[], strip_comments=True, strip=True, ).strip() if not len(maybe): self._reacji = '★' elif maybe in UNICODE_EMOJI: self._reacji = maybe #t = self.meta.get('type', 'webmention') #typemap = { #'like-of': '👍', #'bookmark-of': '🔖', #'favorite': '★', #} #if t in typemap.keys(): #self._reacji = typemap[t] #else: return self._reacji @property def html(self): if hasattr(self, '_html'): return self._html tmp = shared.Pandoc().convert(self.content) self._html = bleach.clean(tmp, strip=True) return self._html @property def tmplvars(self): if hasattr(self, '_tmplvars'): return self._tmplvars self._tmplvars = { 'published': self.published.datetime, 'author': self.meta.get('author', {}), #'content': self.content, #'html': self.html, 'source': self.source, 'target': self.targeturl, 'type': self.meta.get('type', 'webmention'), 'reacji': self.reacji, 'fname': self.fname } return self._tmplvars @property def published(self): if hasattr(self, '_published'): return self._published self._published = arrow.get(self.meta.get('date', self.mtime)) return self._published @property def pubtime(self): return int(self.published.timestamp) @property def source(self): if hasattr(self, '_source'): return self._source s = self.meta.get('source', '') domains = shared.config.get('site', 'domains').split(' ') self._source = s for d in domains: if d in s: self._source = '' return self._source @property def targeturl(self): if hasattr(self, '_targeturl'): return self._targeturl t = self.meta.get('target', shared.config.get('site', 'url')) self._targeturl = '{p.path}'.format(p=urllib.parse.urlparse(t)).strip('/') return self._targeturl @property def target(self): if hasattr(self, '_target'): return self._target targetdir = os.path.abspath(os.path.join( shared.config.get('target', 'builddir'), shared.config.get('site', 'commentspath'), self.fname )) self._target = os.path.join(targetdir, 'index.html') return self._target async def render(self, renderer): logging.info("rendering and saving comment %s", self.fname) if not shared.config.getboolean('params', 'force') and os.path.isfile(self.target): ttime = int(os.path.getmtime(self.target)) logging.debug('ttime is %d mtime is %d', ttime, self.mtime) if ttime == self.mtime: logging.debug( '%s exists and up-to-date (lastmod: %d)', self.target, ttime ) return tmplvars = { 'reply': self.tmplvars, 'site': renderer.sitevars, 'taxonomy': {}, } r = renderer.j2.get_template(self.tmplfile).render(tmplvars) self.writerendered(r) class Comments(object): def __init__(self): self.files = glob.glob(os.path.join( shared.config.get('source', 'commentsdir'), "*.md" )) self.bytarget = {} def __getitem__(self, key): return self.bytarget.get(key, BaseIter()) def populate(self): for fpath in self.files: item = Comment(fpath) t = item.targeturl if not self.bytarget.get(t): self.bytarget[t] = BaseIter() self.bytarget[t].append(item.pubtime, item) class Images(BaseIter): def __init__(self, extensions=['jpg', 'gif', 'png']): super(Images, self).__init__() logging.info( "initiating images with extensions: %s", extensions ) self.files = [] self.data = {} # if anyone knows how to do this in a more pythonic way, please tell me paths = [ shared.config.get('source', 'filesdir'), shared.config.get('source', 'photosdir') ] for p in paths: for ext in extensions: self.files += glob.glob(os.path.join(p, "*.%s" % ext)) def populate(self): with ExifTool() as e: _meta = e.run(*self.files) # parsing the returned meta into a dict of [filename]={meta} for e in _meta: if 'FileName' not in e: logging.error("missing 'FileName' in element %s", e) continue fname = os.path.basename(e['FileName']) del(e['FileName']) # duplicate files are going to be a problem, so don't send it # away with a simple error log entry if fname in self.data: raise ValueError('filename collision: %s', fname) # convert dates for k, v in e.items(): e[k] = self.exifdate(v) self.data[fname] = WebImage(fname, e) def exifdate(self, value): """ converts and EXIF date string to ISO 8601 format :param value: EXIF date (2016:05:01 00:08:24) :type arg1: str :return: ISO 8601 string with UTC timezone 2016-05-01T00:08:24+0000 :rtype: str """ if not isinstance(value, str): return value match = shared.EXIFREXEG.match(value) if not match: return value return "%s-%s-%sT%s+0000" % ( match.group('year'), match.group('month'), match.group('day'), match.group('time') ) class WebImage(object): def __init__(self, fname, meta): logging.info( "parsing image: %s", fname ) self.meta = meta self.fpath = os.path.abspath(meta.get('SourceFile', fname)) self.fname, self.ext = os.path.splitext(fname) self.alttext = '' self.sizes = [] self.fallbacksize = int(shared.config.get('common','fallbackimg', fallback='720')) self.cl = '' self.singleimage = False for size in shared.config.options('downsize'): sizeext = shared.config.get('downsize', size) fname = "%s_%s%s" % (self.fname, sizeext, self.ext) self.sizes.append(( int(size), { 'fpath': os.path.join( shared.config.get('target', 'filesdir'), fname ), 'url': "/%s/%s" % ( #'url': "%s/%s/%s" % ( #shared.config.get('site', 'url'), shared.config.get('source', 'files'), fname ), 'crop': shared.config.getboolean('crop', size, fallback=False), } )) self.sizes = sorted(self.sizes, reverse=False) self.target = False if self.is_downsizeable: self.fallback = [e for e in self.sizes if e[0] == self.fallbacksize][0][1]['url'] self.small = [e for e in self.sizes if e[1]['crop'] == False][0][1]['url'] self.target = self.sizes[-1][1]['url'] else: self.small = self.fallback = "/%s/%s" % ( #self.small = self.fallback = "%s/%s/%s" % ( #shared.config.get('site', 'url'), shared.config.get('source', 'files'), "%s%s" % (self.fname, self.ext) ) def _setupvars(self): if self.is_downsizeable: if self.singleimage and not self.cl: self.cl = '.u-photo' elif self.singleimage: self.cl = '.u-photo %s' % self.cl else: if not self.cl: self.cl = '.aligncenter' @property def tmplvars(self): self._setupvars() if hasattr(self, '_tmplvars'): return self._tmplvars self._tmplvars = { 'alttext': self.alttext, 'fallback': self.fallback, 'small': self.small, 'title': "%s%s" % (self.fname, self.ext), 'target': self.target, 'cl': " ".join([s[1:] for s in self.cl.split()]), 'orientation': self.orientation } return self._tmplvars def __str__(self): self._setupvars() if self.is_downsizeable: #if self.singleimage and not self.cl: #self.cl = '.u-photo' #elif self.singleimage: #self.cl = '.u-photo %s' % self.cl return '[![%s](%s "%s%s"){.adaptimg}](%s){.adaptive %s}' % ( self.alttext, self.fallback, self.fname, self.ext, self.target, self.cl ) else: #if not self.cl: #self.cl = '.aligncenter' return '![%s](%s "%s%s"){%s}' % ( self.alttext, self.fallback, self.fname, self.ext, self.cl ) @property def exif(self): if not self.is_photo: return {} if hasattr(self, '_exif'): return self._exif exif = {} mapping = { 'camera': [ 'Model' ], 'aperture': [ 'FNumber', 'Aperture' ], 'shutter_speed': [ 'ExposureTime' ], 'focallength35mm': [ 'FocalLengthIn35mmFormat', ], 'focallength': [ 'FocalLength', ], 'iso': [ 'ISO' ], 'lens': [ 'LensID', ], 'date': [ 'CreateDate', 'DateTimeOriginal', ], 'geo_latitude': [ 'GPSLatitude' ], 'geo_longitude': [ 'GPSLongitude' ], } for ekey, candidates in mapping.items(): for candidate in candidates: maybe = self.meta.get(candidate, None) if maybe: if 'geo_' in ekey: exif[ekey] = round(float(maybe), 5) else: exif[ekey] = maybe break self._exif = exif return self._exif @property def orientation(self): width = int(self.meta.get('ImageWidth', 0)) height = int(self.meta.get('ImageHeight', 0)) if width >= height: return 'horizontal' return 'vertical' @property def rssenclosure(self): """ Returns the largest available image for RSS to add as attachment """ if hasattr(self, '_rssenclosure'): return self._rssenclosure target = self.sizes[-1][1] self._rssenclosure = { 'mime': magic.Magic(mime=True).from_file(target['fpath']), 'url': target['url'], 'size': os.path.getsize(target['fpath']), 'fname': self.fname } return self._rssenclosure @property def is_photo(self): if hasattr(self, '_is_photo'): return self._is_photo self._is_photo = False #if not pattern or not isinstance(pattern, str): # return False pattern = re.compile(shared.config.get('photo', 'regex')) cpr = self.meta.get('Copyright', '') art = self.meta.get('Artist', '') if not cpr and not art: return False if cpr and art: if pattern.search(cpr) or pattern.search(art): self._is_photo = True return self._is_photo @property def is_downsizeable(self): if hasattr(self, '_is_downsizeable'): return self._is_downsizeable self._is_downsizeable = False """ Check if the image is large enough and jpeg or png in order to downsize it """ fb = self.sizes[-1][0] ftype = self.meta.get('FileType', None) if not ftype: return self._is_downsizeable if ftype.lower() == 'jpeg' or ftype.lower() == 'png': width = int(self.meta.get('ImageWidth', 0)) height = int(self.meta.get('ImageHeight', 0)) if width > fb or height > fb: self._is_downsizeable = True return self._is_downsizeable def _copy(self): target = os.path.join( shared.config.get('target', 'filesdir'), "%s%s" % (self.fname, self.ext) ) if not os.path.isfile(target): logging.debug("can't downsize %s, copying instead" % self.fname) shutil.copy(self.fpath, target) def _watermark(self, img): """ Composite image by adding watermark file over it """ wmarkfile = os.path.join( shared.config.get('common', 'basedir'), shared.config.get('common', 'watermark') ) if not os.path.isfile(wmarkfile): return img with wand.image.Image(filename=wmarkfile) as wmark: if img.width > img.height: w = img.width * 0.16 h = wmark.height * (w / wmark.width) x = img.width - w - (img.width * 0.01) y = img.height - h - (img.height * 0.01) else: w = img.height * 0.16 h = wmark.height * (w / wmark.width) x = img.width - h - (img.width * 0.01) y = img.height - w - (img.height * 0.01) w = round(w) h = round(h) x = round(x) y = round(y) wmark.resize(w, h) if img.width <= img.height: wmark.rotate(-90) img.composite(image=wmark, left=x, top=y) return img def _intermediate_dimensions(self, size, width, height, crop = False): size = int(size) w = width h = height if (width > height and not crop) \ or (width < height and crop): w = size h = int(float(size / width) * height) else: h = size w = int(float(size / height) * width) return (w, h) def _intermediate(self, img, size, meta, existing = []): if img.width <= size and img.height <= size: return False crop = meta.get('crop', False) with img.clone() as thumb: width, height = self._intermediate_dimensions( size, img.width, img.height, crop ) thumb.resize(width, height) if crop: thumb.liquid_rescale(size, size, 1, 1) if self.meta.get('FileType', 'jpeg').lower() == 'jpeg': thumb.compression_quality = 86 thumb.unsharp_mask( radius=0, sigma=0.5, amount=1, threshold=0.03 ) thumb.format = 'pjpeg' # this is to make sure pjpeg happens with open(meta['fpath'], 'wb') as f: thumb.save(file=f) return True async def downsize(self, existing = []): if not self.is_downsizeable: self._copy() return logging.info("checking downsizing for %s", self.fname) needed = shared.config.getboolean('params', 'regenerate', fallback=False) if not needed: for (size, meta) in self.sizes: if meta['fpath'] not in existing: needed = True if not needed: logging.debug("downsizing not needed for %s", self.fname) return with wand.image.Image(filename=self.fpath) as img: img.auto_orient() if self.is_photo: logging.info("%s is a photo", self.fpath) img = self._watermark(img) for (size, meta) in self.sizes: self._intermediate(img, size, meta, existing) class Taxonomy(BaseIter): def __init__(self, name = None, taxonomy = None, slug = None): super(Taxonomy, self).__init__() self.name = name if name and not slug: self.slug = slugify(name, only_ascii=True, lower=True) else: self.slug = slug self.taxonomy = taxonomy #@property #def pages(self): #if hasattr(self, '_pages'): #return self._pages #self._pages = math.ceil(len(self.data) / shared.config.getint('common', 'pagination')) #return self._pages def __repr__(self): return "taxonomy %s with %d items" % (self.taxonomy, len(self.data)) @property def basep(self): p = shared.config.get('target', 'builddir') if self.taxonomy: p = os.path.join(p, self.taxonomy) if not os.path.isdir(p): os.mkdir(p) return p @property def myp(self): p = self.basep if self.slug: p = os.path.join(p,self.slug) if not os.path.isdir(p): os.mkdir(p) return p @property def feedp(self): p = os.path.join(self.myp, 'feed') if not os.path.isdir(p): os.mkdir(p) return p @property def pagep(self): p = os.path.join(self.myp, 'page') if not os.path.isdir(p): os.mkdir(p) return p @property def baseurl(self): if self.taxonomy and self.slug: return "/%s/%s/" % (self.taxonomy, self.slug) else: return '/' @property def mtime(self): if hasattr(self, '_mtime'): return self._mtime self._mtime = int(list(sorted(self.data.keys(), reverse=True))[0]) return self._mtime def __mkdirs(self): check = [self.basep, self.myp, self.feedp] for p in check: if not os.path.isdir(p): logging.debug("creating dir %s", p) os.mkdir(p) def tpath(self, page): if page == 1: p = "%s" % (self.myp) else: p = os.path.join(self.pagep, "%d" % page) if not os.path.isdir(p): logging.debug("creating dir %s", p) os.mkdir(p) return os.path.join(p, "index.html") @property def is_singlepage(self): spcats = shared.config.get('common', 'onepagecategories').split(',') if self.name in spcats and 'category' == self.taxonomy: return True return False def posttmpls(self, order='time', start=0, end=None): end = end or len(self.data) if 'all' == order: return [ i.tmplvars for k, i in list(sorted( self.data.items(), key=lambda value: value[1].title.lower() )) ] return [ self.data[k].tmplvars for k in list(sorted( self.data.keys(), reverse=True ))[start:end] ] async def render(self, renderer): #if not self.slug or self.slug is 'None': #return self.__mkdirs() page = 1 testpath = self.tpath(page) if not shared.config.getboolean('params', 'force') and os.path.isfile(testpath): ttime = int(os.path.getmtime(testpath)) mtime = self.mtime if ttime == mtime: logging.info('taxonomy index for "%s" exists and up-to-date (lastmod: %d)', self.slug, ttime) return else: logging.info('taxonomy update needed: %s timestamp is %d, last post timestamp is %d (%s)', testpath, ttime, mtime, self.data[mtime].fname ) if self.is_singlepage: pagination = len(self.data) else: pagination = shared.config.getint('common', 'pagination') pages = math.ceil(len(self.data) / pagination) while page <= pages: self.render_page(renderer, page, pagination, pages) page = page+1 self.render_feeds(renderer) self.ping_websub def render_feeds(self, renderer): pagination = shared.config.getint('common', 'pagination') start = 0 end = int(start + pagination) posttmpls = self.posttmpls('time', start, end) tmplvars = { 'taxonomy': { 'url': self.baseurl, 'name': self.name, 'slug': self.slug, 'taxonomy': self.taxonomy, 'lastmod': arrow.get(self.mtime).datetime }, 'site': renderer.sitevars, 'posts': posttmpls, } target = os.path.join(self.feedp, 'index.atom') logging.info("rendering Atom feed to %s", target) r = renderer.j2.get_template('atom.html').render(tmplvars) with open(target, "wt") as html: html.write(r) os.utime(target, (self.mtime, self.mtime)) def render_page(self, renderer, page, pagination, pages): if self.is_singlepage: posttmpls = self.posttmpls('all') else: start = int((page-1) * pagination) end = int(start + pagination) posttmpls = self.posttmpls('time', start, end) target = self.tpath(page) tdir = os.path.dirname(target) if not os.path.isdir(tdir): logging.debug("creating dir %s", tdir) os.mkdir(tdir) logging.info("rendering taxonomy page %d to %s", page, target) tmplvars = { 'taxonomy': { 'url': self.baseurl, 'name': self.name, 'slug': self.slug, 'taxonomy': self.taxonomy, 'paged': page, 'total': pages, 'perpage': pagination, 'lastmod': arrow.get(self.mtime).datetime }, 'site': renderer.sitevars, 'posts': posttmpls, } r = renderer.j2.get_template('archive.html').render(tmplvars) with open(target, "wt") as html: html.write(r) os.utime(target, (self.mtime, self.mtime)) def ping_websub(self): if not self.taxonomy or self.taxonomy == 'category': t = shared.config.get('site', 'websuburl') data = { 'hub.mode': 'publish', 'hub.url': "%s%s" % ( shared.config.get('site', 'url'), self.baseurl ) } logging.info("pinging %s with data %s", t, data) requests.post(t, data=data) #def renderpage(self, renderer, page): #pagination = int(shared.config.get('common', 'pagination')) #start = int((page-1) * pagination) #end = int(start + pagination) #posttmpls = [self.data[k].tmplvars for k in list(sorted( #self.data.keys(), reverse=True))[start:end]] #target = self.tpath(page) #logging.info("rendering taxonomy page %d to %s", page, target) #tmplvars = { #'taxonomy': { #'url': self.baseurl, #'name': self.name, #'slug': self.slug, #'taxonomy': self.taxonomy, #'paged': page, #'total': self.pages, #'perpage': pagination, #'lastmod': arrow.get(self.mtime).datetime #}, #'site': renderer.sitevars, #'posts': posttmpls, #} #r = renderer.j2.get_template('archive.html').render(tmplvars) #with open(target, "wt") as html: #html.write(r) #os.utime(target, (self.mtime, self.mtime)) #if 1 == page: ##target = os.path.join(self.feedp, 'index.rss') ##logging.info("rendering RSS feed to %s", target) ##r = renderer.j2.get_template('rss.html').render(tmplvars) ##with open(target, "wt") as html: ##html.write(r) ##os.utime(target, (self.mtime, self.mtime)) #target = os.path.join(self.feedp, 'index.atom') #logging.info("rendering Atom feed to %s", target) #r = renderer.j2.get_template('atom.html').render(tmplvars) #with open(target, "wt") as html: #html.write(r) #os.utime(target, (self.mtime, self.mtime)) ## --- ## this is a joke ## see http://indieweb.org/YAMLFeed ## don't do YAMLFeeds. #if 1 == page: #fm = frontmatter.loads('') #fm.metadata = { #'site': { #'author': renderer.sitevars['author'], #'url': renderer.sitevars['url'], #'title': renderer.sitevars['title'], #}, #'items': [], #} #for p in posttmpls: #fm.metadata['items'].append({ #'title': p['title'], #'url': "%s/%s/" % ( renderer.sitevars['url'], p['slug']), #'content': p['content'], #'summary': p['summary'], #'published': p['published'], #'updated': p['updated'], #}) #target = os.path.join(self.feedp, 'index.yml') #logging.info("rendering YAML feed to %s", target) #with open(target, "wt") as html: #html.write(frontmatter.dumps(fm)) #os.utime(target, (self.mtime, self.mtime)) ## --- #if 1 == page: #if not self.taxonomy or self.taxonomy == 'category': #t = shared.config.get('site', 'websuburl') #data = { #'hub.mode': 'publish', #'hub.url': "%s%s" % ( #shared.config.get('site', 'url'), self.baseurl #) #} #logging.info("pinging %s with data %s", t, data) #requests.post(t, data=data) class Content(BaseIter): def __init__(self, images, comments, extensions=['md']): super(Content, self).__init__() self.images = images self.comments = comments basepath = shared.config.get('source', 'contentdir') self.files = [] for ext in extensions: self.files += glob.glob(os.path.join(basepath, "*", "*.%s" % ext)) self.tags = {} self.categories = {} self.front = Taxonomy() self.shortslugmap = {} def populate(self): now = arrow.utcnow().timestamp for fpath in self.files: item = Singular(fpath, self.images, self.comments) self.append(item.pubtime, item) #self.shortslugmap[item.shortslug] = item.fname if item.isfuture: logging.warning("skipping future post %s", item.fname) continue if item.isonfront: self.front.append(item.pubtime, item) if item.iscategorised: if item.category not in self.categories: self.categories[item.category] = Taxonomy(item.category, 'category') self.categories[item.category].append(item.pubtime, item) for tag in item.tags: tslug = slugify(tag, only_ascii=True, lower=True) if tslug not in self.tags: self.tags[tslug] = Taxonomy(tag, 'tag', tslug) self.tags[tslug].append(item.pubtime, item) #self.symlinktag(tslug, item.path) #def symlinktag(self, tslug, fpath): #fdir, fname = os.path.split(fpath) #tagpath = os.path.join(shared.config.get('source', 'tagsdir'), tslug) #if not os.path.isdir(tagpath): #os.mkdir(tagpath) #sympath = os.path.relpath(fdir, tagpath) #dst = os.path.join(tagpath, fname) #src = os.path.join(sympath, fname) #if not os.path.islink(dst): #os.symlink(src, dst) def sitemap(self): target = os.path.join( shared.config.get('target', 'builddir'), 'sitemap.txt' ) urls = [] for item in self.data.values(): urls.append( "%s/%s/" % ( shared.config.get('site', 'url'), item.fname )) with open(target, "wt") as f: logging.info("writing sitemap to %s" % (target)) f.write("\n".join(urls)) def magicphp(self, renderer): redirects = [] gones = [] rfile = os.path.join( shared.config.get('common', 'basedir'), shared.config.get('common', 'redirects') ) if os.path.isfile(rfile): with open(rfile, newline='') as csvfile: r = csv.reader(csvfile, delimiter=' ') for row in r: redirects.append((row[0], row[1])) for item in self.data.values(): redirects.append((item.shortslug, item.fname)) rfile = os.path.join( shared.config.get('common', 'basedir'), shared.config.get('common', 'gone') ) if os.path.isfile(rfile): with open(rfile, newline='') as csvfile: r = csv.reader(csvfile, delimiter=' ') for row in r: gones.append(row[0]) tmplvars = { 'site': renderer.sitevars, 'redirects': redirects, 'gones': gones } r = renderer.j2.get_template("magic.php").render(tmplvars) target = os.path.abspath(os.path.join( shared.config.get('target', 'builddir'), 'magic.php' )) with open(target, "w") as html: logging.debug('writing %s', target) html.write(r) html.close() class Singular(BaseRenderable): def __init__(self, path, images = None, comments = None): logging.debug("initiating singular object from %s", path) self.path = path self.images = images or Images() self.allcomments = comments or Comments() self.category = splitpath(path)[-2] self.mtime = int(os.path.getmtime(self.path)) self.fname, self.ext = os.path.splitext(os.path.basename(self.path)) self.meta = {} self.content = '' self.photo = self.images.data.get("%s.jpg" % self.fname, None) if self.photo: self.photo.singleimage = True self.__parse() def __repr__(self): return "%s (lastmod: %s)" % (self.fname, self.published) def __parse(self): with open(self.path, mode='rt') as f: self.meta, self.content = frontmatter.parse(f.read()) #self.__filter_syndication() self.__filter_favs() self.__filter_images() #if self.isphoto: #self.content = "%s\n\n%s" % ( #self.photo, #self.content, #) #if shared.config.getboolean('params', 'nooffline'): #return #trigger = self.offlinecopies #def __filter_syndication(self): #syndications = self.meta.get('syndicate', None) #if not syndications: #return #s = "\n\n".join(['' % s for s in syndications]) #self.content = "%s\n\n%s" % ( #s, #self.content #) def __filter_favs(self): url = self.meta.get('favorite-of', self.meta.get('like-of', self.meta.get('bookmark-of', False ) ) ) if not url: return img = self.meta.get('image', False) imgs = self.meta.get('images', []) if img: imgs.append(img) if not imgs or not len(imgs): return c = '' for i in imgs: c = '%s\n[![%s](/%s/%s)](%s){.favurl}' % ( c, self.title, shared.config.get('source', 'files'), i, url ) if self.isbookmark: c = "%s\n\n%s" % (c, self.content) self.content = c def __filter_images(self): linkto = False isrepost = None if len(self.reactions.keys()): isrepost = list(self.reactions.keys())[0] if isrepost and \ len(self.reactions[isrepost]) == 1: linkto = self.reactions[isrepost][0] m = shared.MDIMGREGEX.findall(self.content) if not m: logging.debug("no images found") return for shortcode, alt, fname, title, cl in m: image = self.images.data.get(fname, None) if not image: logging.debug("%s not found in images", fname) continue if cl: image.cl = cl logging.debug( "replacing %s in content with %s", shortcode, "%s" % image ) self.content = self.content.replace( shortcode, "%s" % image ) @property def comments(self): if hasattr(self, '_comments'): return self._comments # the comments could point to both the "real" url and the shortslug # so I need to get both c = {} for by in [self.fname, self.shortslug]: c = {**c, **self.allcomments[by].data} #self._comments = [c[k].tmplvars for k in list(sorted(c.keys(), reverse=True))] self._comments = [c[k] for k in list(sorted(c.keys(), reverse=False))] return self._comments @property def replies(self): if hasattr(self, '_replies'): return self._replies self._replies = [c.tmplvars for c in self.comments if not len(c.reacji)] return self._replies @property def reacjis(self): if hasattr(self, '_reacjis'): return self._reacjis reacjis = {} for c in self.comments: rj = c.reacji if not len(rj): continue if not reacjis.get(rj, False): reacjis[rj] = [] reacjis[rj].append(c.tmplvars) self._reacjis = reacjis return self._reacjis @property def reactions(self): if hasattr(self, '_reactions'): return self._reactions # getting rid of '-' to avoid css trouble and similar convert = { 'bookmark-of': 'bookmark', 'repost-of': 'repost', 'in-reply-to': 'reply', 'favorite-of': 'fav', 'like-of': 'like', } reactions = {} for k, v in convert.items(): x = self.meta.get(k, None) if not x: continue if isinstance(x, str): x = [x] reactions[v] = x self._reactions = reactions return self._reactions @property def author(self): return dict(shared.config.items('author')) @property def license(self): if hasattr(self, '_licence'): return self._licence if 'article' == self.category: l = { 'url': 'https://creativecommons.org/licenses/by/4.0/', 'text': 'CC BY 4.0', } elif 'journal' == self.category: l = { 'url': 'https://creativecommons.org/licenses/by-nc/4.0/', 'text': 'CC BY-NC 4.0', } else: l = { 'url': 'https://creativecommons.org/licenses/by-nc-nd/4.0/', 'text': 'CC BY-NC-ND 4.0', } self._licence = l return self._licence @property def syndicate(self): return self.meta.get('syndicate', []) @property def urls(self): if hasattr(self, '_urls'): return self._urls #urls = shared.URLREGEX.findall(self.content) #urls = [*urls, *self.syndicate] urls = list(self.syndicate) for reactionurls in self.reactions.values(): urls = [*urls, *reactionurls] #r = [] #logging.debug('searching for urls for %s', self.fname) #for link in urls: #purl = urllib.parse.urlparse(link) #if purl.netloc in shared.config.get('site', 'domains'): #logging.debug('excluding url %s - %s matches %s', link, purl.netloc, shared.config.get('site', 'domains')) #continue #if link in r: #continue #r.append(link) self._urls = urls logging.debug('urls for %s: %s', self.fname, urls) return self._urls @property def lang(self): if hasattr(self, '_lang'): return self._lang lang = 'en' try: lang = langdetect.detect("\n".join([ self.title, self.content ])) except: pass self._lang = lang return self._lang @property def tags(self): return list(self.meta.get('tags', [])) @property def published(self): if hasattr(self, '_published'): return self._published self._published = arrow.get( self.meta.get('published', self.mtime) ) return self._published @property def updated(self): if hasattr(self, '_updated'): return self._updated self._updated = arrow.get( self.meta.get('updated', self.meta.get('published', self.mtime) ) ) return self._updated @property def pubtime(self): return int(self.published.timestamp) @property def isphoto(self): if not self.photo: return False return self.photo.is_photo @property def isbookmark(self): return self.meta.get('bookmark-of', False) @property def isreply(self): return self.meta.get('in-reply-to', False) @property def isfuture(self): now = arrow.utcnow().timestamp if self.pubtime > now: return True return False # TODO #@property #def isrvsp(self): # r'([^<]+)' @property def isfav(self): r = False for maybe in ['like-of', 'favorite-of']: maybe = self.meta.get(maybe, False) if maybe: r = maybe break return r @property def ispage(self): if not self.meta: return True return False @property def isonfront(self): if self.ispage: return False if self.isbookmark: return False if self.isfav: return False return True @property def iscategorised(self): if self.ispage: return False return True @property def summary(self): return self.meta.get('summary', '') @property def title(self): if hasattr(self, '_title'): return self._title self._title = '' for maybe in ['title', 'bookmark-of', 'in-reply-to', 'repost-of']: maybe = self.meta.get(maybe, False) if maybe: if isinstance(maybe, list): maybe = maybe.pop() self._title = maybe.replace('\n', ' ').replace('\r', '') break return self._title @property def url(self): return "%s/%s/" % (shared.config.get('site', 'url'), self.fname) @property def tmplfile(self): if self.ispage: return 'page.html' else: return 'singular.html' @property def html(self): if hasattr(self, '_html'): return self._html self._html = shared.Pandoc().convert(self.content) return self._html @property def sumhtml(self): if hasattr(self, '_sumhtml'): return self._sumhtml self._sumhtml = self.meta.get('summary', '') if len(self._sumhtml): self._sumhtml = shared.Pandoc().convert(self.summary) return self._sumhtml #@property #def offlinecopies(self): ## stupidly simple property caching #if hasattr(self, 'copies'): #return self.copies #copies = {} #for maybe in ['bookmark-of', 'in-reply-to', 'repost-of', 'favorite-of']: #maybe = self.meta.get(maybe, False) #if not maybe: #continue #if not isinstance(maybe, list): #maybe = [maybe] #for url in maybe: #arch = OfflineArchive(url) #arch.run() ##copies[url] = arch.read() ##self.copies = copies ##return copies @property def exif(self): if not self.isphoto: return {} return self.photo.exif #@property #def rssenclosure(self): #if not self.isphoto: #return {} #return self.photo.rssenclosure @property def tmplvars(self): if hasattr(self, '_tmplvars'): return self._tmplvars self._tmplvars = { 'title': self.title, 'published': self.published.datetime, 'tags': self.tags, 'author': self.author, 'content': self.content, 'html': self.html, 'category': self.category, 'reactions': self.reactions, 'updated': self.updated.datetime, 'summary': self.summary, 'sumhtml': self.sumhtml, 'exif': self.exif, 'lang': self.lang, 'syndicate': self.syndicate, 'slug': self.fname, 'shortslug': self.shortslug, 'comments': self.comments, 'replies': self.replies, 'reacjis': self.reacjis, 'photo': {}, 'rssenclosure': {}, 'license': self.license, } if self.isphoto: self._tmplvars.update({ 'photo': self.photo.tmplvars, 'rssenclosure': self.photo.rssenclosure }) return self._tmplvars @property def shortslug(self): if hasattr(self, '_shortslug'): return self._shortslug self._shortslug = shared.baseN(self.pubtime) return self._shortslug @property def target(self): targetdir = os.path.abspath(os.path.join( shared.config.get('target', 'builddir'), self.fname )) return os.path.join(targetdir, 'index.html') async def rendercomments(self, renderer): for comment in self.comments: await comment.render(renderer) async def render(self, renderer = None): renderer = renderer or Renderer() # this is only when I want salmentions and I want to include all of the comments as well # otherwise it affects both webmentions sending and search indexing #if len(self.comments): #lctime = self.comments[0].mtime #if lctime > self.mtime: #self.mtime = lctime #await self.rendercomments(renderer) mtime = self.mtime if len(self.comments): lctime = self.comments[0].mtime if lctime > self.mtime: mtime = lctime logging.info("rendering and saving %s", self.fname) if not shared.config.getboolean('params', 'force') and os.path.isfile(self.target): ttime = int(os.path.getmtime(self.target)) logging.debug('ttime is %d mtime is %d', ttime, mtime) if ttime == mtime: logging.debug( '%s exists and up-to-date (lastmod: %d)', self.target, ttime ) return tmplvars = { 'post': self.tmplvars, 'site': renderer.sitevars, 'taxonomy': {}, } r = renderer.j2.get_template(self.tmplfile).render(tmplvars) self.writerendered(r, mtime) async def ping(self, pinger): if self.isfuture: return logging.debug('urls in %s: %s', self.fname, self.urls) for target in self.urls: record = { 'mtime': self.mtime, 'source': self.url, 'target': target } h = json.dumps(record, sort_keys=True) h = hashlib.sha1(h.encode('utf-8')).hexdigest() if pinger.db.get(h, False): logging.debug( "%s is already pinged from %s @ %d, skipping", target, self.url, self.mtime ) continue logging.info("sending webmention from %s to %s", self.url, target) ws = WebmentionSend(self.url, target) try: ws.send(allow_redirects=True, timeout=30) except Exception as e: logging.error('ping failed to %s', target) pinger.db[h] = record class Webmentioner(object): def __init__(self): self.dbpath = os.path.abspath(os.path.join( shared.config.get('target', 'builddir'), shared.config.get('var', 'webmentions') )) if os.path.isfile(self.dbpath): with open(self.dbpath, 'rt') as f: self.db = json.loads(f.read()) else: self.db = {} def finish(self): with open(self.dbpath, 'wt') as f: f.write(json.dumps(self.db, sort_keys=True, indent=4)) class NASG(object): lockfile = os.path.join(tempfile.gettempdir(), 'nasg_%s.lock' % getpass.getuser()) def __init__(self): # --- set params parser = argparse.ArgumentParser(description='Parameters for NASG') parser.add_argument( '--clear', action='store_true', default=False, help='clear build directory in advance' ) parser.add_argument( '--regenerate', action='store_true', default=False, help='force downsizing images' ) parser.add_argument( '--force', action='store_true', default=False, help='force rendering HTML' ) parser.add_argument( '--loglevel', default='error', help='change loglevel' ) parser.add_argument( '--nooffline', action='store_true', default=False, help='skip offline copie checks' ) parser.add_argument( '--nodownsize', action='store_true', default=False, help='skip image downsizing' ) parser.add_argument( '--norender', action='store_true', default=False, help='skip rendering' ) parser.add_argument( '--refetch', action='store_true', default=False, help='force re-fetching offline archives' ) params = vars(parser.parse_args()) shared.config.add_section('params') for k, v in params.items(): shared.config.set('params', k, str(v)) # remove the rest of the potential loggers while len(logging.root.handlers) > 0: logging.root.removeHandler(logging.root.handlers[-1]) # --- set loglevel logging.basicConfig( level=shared.LLEVEL[shared.config.get('params', 'loglevel')], format='%(asctime)s - %(levelname)s - %(message)s' ) async def __adownsize(self): for fname, img in self.images: await img.downsize(self.existing) async def __acrender(self): for (pubtime, singular) in self.content: await singular.render(self.renderer) async def __atrender(self): for e in [self.content.categories, self.content.tags]: for name, t in e.items(): await t.render(self.renderer) async def __afrender(self): await self.content.front.render(self.renderer) async def __aindex(self): for (pubtime, singular) in self.content: await self.searchdb.append(singular) async def __aping(self): for (pubtime, singular) in self.content: await singular.ping(self.pinger) def __atindexrender(self): m = { 'category': self.content.categories, 'tag': self.content.tags } for p, taxonomy in m.items(): target = os.path.abspath(os.path.join( shared.config.get('target', 'builddir'), p, 'index.html' )) tmplvars = { 'site': self.renderer.sitevars, 'taxonomy': {}, 'taxonomies': {} } for name, t in taxonomy.items(): logging.debug('adding %s to taxonomyindex' % name) tmplvars['taxonomies'][name] = [{'title': t.data[k].title, 'url': t.data[k].url} for k in list(sorted( t.data.keys(), reverse=True))] tmplvars['taxonomies'] = sorted(tmplvars['taxonomies'].items()) logging.debug('rendering taxonomy index to %s', target) r = self.renderer.j2.get_template('archiveindex.html').render(tmplvars) with open(target, "wt") as html: html.write(r) @property def images(self): if hasattr(self, '_images'): return self._images logging.info("discovering images") images = Images() images.populate() self._images = images return self._images @property def content(self): if hasattr(self, '_content'): return self._content logging.info("discovering content") content = Content(self.images, self.comments) content.populate() self._content = content return self._content @property def comments(self): if hasattr(self, '_comments'): return self._comments logging.info("discovering comments") comments = Comments() comments.populate() self._comments = comments return self._comments @property def existing(self): if hasattr(self, '_existing'): return self._existing existing = glob.glob(os.path.join( shared.config.get('target', 'filesdir'), "*" )) self._existing = existing return self._existing @property def renderer(self): if hasattr(self, '_renderer'): return self._renderer self._renderer = Renderer() return self._renderer @property def searchdb(self): if hasattr(self, '_searchdb'): return self._searchdb self._searchdb = Indexer() return self._searchdb @property def pinger(self): if hasattr(self, '_pinger'): return self._pinger self._pinger = Webmentioner() return self._pinger def run(self): if os.path.isfile(self.lockfile): raise ValueError( "Lockfile is present at %s; another instance is running." % ( self.lockfile ) ) else: atexit.register(os.remove, self.lockfile) with open(self.lockfile, "wt") as f: f.write(arrow.utcnow().format()) if shared.config.getboolean('params', 'clear'): input('about to clear build directory, press enter to continue') shutil.rmtree(os.path.abspath( shared.config.get('target', 'builddir') )) loop = asyncio.get_event_loop() for d in shared.config.options('target'): if 'dir' in d and not os.path.isdir(shared.config.get('target', d)): os.mkdir(shared.config.get('target', d)) if not shared.config.getboolean('params', 'nodownsize'): logging.info("downsizing images") loop.run_until_complete(self.__adownsize()) if not shared.config.getboolean('params', 'norender'): logging.info("rendering content") loop.run_until_complete(self.__acrender()) logging.info("rendering categories and tags") loop.run_until_complete(self.__atrender()) logging.info("rendering the front page elements") loop.run_until_complete(self.__afrender()) logging.info("rendering taxonomy indexes") self.__atindexrender() logging.info("rendering sitemap") self.content.sitemap() logging.info("render magic.php") self.content.magicphp(self.renderer) logging.info("copy the static bits") src = shared.config.get('source', 'staticdir') for item in os.listdir(src): s = os.path.join(src, item) d = os.path.join(shared.config.get('target', 'builddir'), item) logging.debug("copying %s to %s", s, d) shutil.copy2(s, d) logging.info("pouplating searchdb") loop.run_until_complete(self.__aindex()) self.searchdb.finish() logging.info("webmentioning urls") loop.run_until_complete(self.__aping()) self.pinger.finish() loop.close() if __name__ == '__main__': worker = NASG() worker.run()