diff --git a/envelope.py b/envelope.py new file mode 100644 index 0000000..bca89f9 --- /dev/null +++ b/envelope.py @@ -0,0 +1,193 @@ +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText +from email.mime.image import MIMEImage +from email.header import Header +import email.charset +from email.generator import Generator +from io import StringIO +import mimetypes +from email.mime.base import MIMEBase +from email.encoders import encode_base64 +import email.utils + +import time +import getpass +import socket +import shutil +import requests +import tempfile +import atexit +import os +import re +import smtplib +import logging +from shared import Pandoc + +class Letter(object): + def __init__(self, sender=None, recipient=None, subject='', text=''): + self.sender = sender or (getpass.getuser(), socket.gethostname()) + self.recipient = recipient or self.sender + + self.tmp = tempfile.mkdtemp( + 'envelope_', + dir=tempfile.gettempdir() + ) + atexit.register( + shutil.rmtree, + os.path.abspath(self.tmp) + ) + self.text = text; + self.subject = subject + self.images = [] + self.ready = None + self.time = time.time() + self.headers = {} + + @property + def _html(self): + return Pandoc().convert(self.text) + + @property + def _tmpl(self): + return "
%s" % (self._html) + + def __pull_image(self, img): + fname = os.path.basename(img) + i = { + 'url': img, + 'name': fname, + 'tmp': os.path.join(self.tmp, fname), + } + + logging.debug("pulling image %s", i['url']) + r = requests.get(i['url'], stream=True) + if r.status_code == 200: + with open(i['tmp'], 'wb') as f: + logging.debug("writing image %s", i['tmp']) + r.raw.decode_content = True + shutil.copyfileobj(r.raw, f) + if not isinstance(self.images, list): + self.images = [] + self.images.append(i) + + + def __pull_images(self): + mdmatch = re.compile( + r'!\[.*\]\((.*?\.(?:jpe?g|png|gif)(?:\s+[\'\"]?.*?[\'\"]?)?)\)' + r'(?:\{.*?\})?' + ) + [self.__pull_image(img) for img in mdmatch.findall(self.text)] + + + def __attach_images(self): + self.__pull_images() + for i in self.images: + cid = 'cid:%s' % (i['name']) + logging.debug("replacing %s with %s", i['url'], cid) + self.text = self.text.replace(i['url'], cid) + + + def make(self, inline_images=True): + if inline_images: + self.__attach_images() + + + # Python, by default, encodes utf-8 in base64, which makes plain text + # mail painful; this overrides and forces Quoted Printable. + # Quoted Printable is still awful, but better, and we're going to + # force the mail to be 8bit encoded. + # Note: enforcing 8bit breaks compatibility with ancient mail clients. + email.charset.add_charset('utf-8', email.charset.QP, email.charset.QP, 'utf-8') + + mail = MIMEMultipart('alternative') + + # --- setting headers --- + self.headers = { + 'Subject': Header(re.sub(r"\r?\n?$", "", self.subject, 1), 'utf-8').encode(), + 'To': email.utils.formataddr(self.recipient), + 'From': email.utils.formataddr(self.sender), + 'Date': email.utils.formatdate(self.time, localtime=True) + } + + for k, v in self.headers.items(): + mail.add_header(k, "%s" % v) + logging.debug("headers: %s", self.headers) + + # --- adding plain text --- + text = self.text + _text = MIMEText(text, 'text', _charset='utf-8') + # --- + # this is the part where we overwrite the way Python thinks: + # force the text to be the actual, unencoded, utf-8. + # Note:these steps breaks compatibility with ancient mail clients. + _text.replace_header('Content-Transfer-Encoding', '8bit') + _text.replace_header('Content-Type', 'text/plain; charset=utf-8') + _text.set_payload(self.text) + # --- + logging.debug("text: %s", _text) + mail.attach(_text) + + # --- HTML bit --- + # this is where it gets tricky: the HTML part should be a 'related' + # wrapper, in which the text and all the related images are sitting + _envelope = MIMEMultipart('related') + + + html = self._tmpl + _html = MIMEText(html, 'html', _charset='utf-8') + # --- + # see above under 'adding plain text' + _html.replace_header('Content-Transfer-Encoding', '8bit') + _html.replace_header('Content-Type', 'text/html; charset=utf-8') + _html.set_payload(html) + # --- + logging.debug("HTML: %s", _html) + _envelope.attach(_html) + + for i in self.images: + mimetype, encoding = mimetypes.guess_type(i['tmp']) + mimetype = mimetype or 'application/octet-stream' + mimetype = mimetype.split('/', 1) + attachment = MIMEBase(mimetype[0], mimetype[1]) + with open(i['tmp'], 'rb') as img: + attachment.set_payload(img.read()) + img.close() + os.unlink(i['tmp']) + + encode_base64(attachment) + attachment.add_header( + 'Content-Disposition', + 'inline', + filename=i['name'] + ) + attachment.add_header( + 'Content-ID', + '<%s>' % (i['name']) + ) + + _envelope.attach(attachment) + + # add the whole html + image pack to the mail + mail.attach(_envelope) + + str_io = StringIO() + g = Generator(str_io, False) + g.flatten(mail) + + self.ready = str_io.getvalue().encode('utf-8') + + def send(self): + if not self.ready: + logging.error('this mail is not ready') + return + + try: + s = smtplib.SMTP('127.0.0.1', 25) + # unless you do the encode, you'll get: + # File "/usr/local/lib/python3.5/smtplib.py", line 850, in sendmail + # msg = _fix_eols(msg).encode('ascii') + # UnicodeEncodeError: 'ascii' codec can't encode character '\xa0' in position 1073: ordinal not in range(128) + s.sendmail(self.headers['From'], self.headers['To'], self.ready) + s.quit() + except Exception as e: + logging.error('sending mail failed with error: %s', e) diff --git a/nasg.py b/nasg.py old mode 100644 new mode 100755 index 385f4ad..cacb737 --- a/nasg.py +++ b/nasg.py @@ -8,15 +8,15 @@ import shutil import logging import json import glob -import subprocess import tempfile import atexit import re import hashlib import math import asyncio -import magic +import csv +import magic import arrow import wand.image import similar_text @@ -27,7 +27,7 @@ import requests from breadability.readable import Article from whoosh import index import jinja2 - +import urllib.parse import shared def splitpath(path): @@ -70,13 +70,19 @@ class Indexer(object): for url, offlinecopy in singular.offlinecopies.items(): content_remote.append("%s" % offlinecopy) + weight = 1 + if singular.isbookmark: + weight = 10 + if singular.ispage: + weight = 100 + self.writer.add_document( title=singular.title, url=singular.url, content=" ".join(list(map(str,[*content_real, *content_remote]))), date=singular.published.datetime, tags=",".join(list(map(str, singular.tags))), - weight=1, + weight=weight, img="%s" % singular.photo ) @@ -190,35 +196,6 @@ class Renderer(object): return True return False - #def rendersingular(self, singular): - #logging.debug("rendering and saving %s", singular.fname) - #targetdir = os.path.abspath(os.path.join( - #shared.config.get('target', 'builddir'), - #singular.fname - #)) - #target = os.path.join(targetdir, 'index.html') - - #if not shared.config.get('params', 'force') and os.path.isfile(target): - #ttime = int(os.path.getmtime(target)) - #if ttime == singular.mtime: - #logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime) - #return - - #if not os.path.isdir(targetdir): - #os.mkdir(targetdir) - - #tmpl = self.j2.get_template(singular.tmplfile) - #tmplvars = { - #'post': singular.tmplvars, - #'site': self.sitevars, - #'taxonomy': {}, - #} - #r = tmpl.render(tmplvars) - #with open(target, "w") as html: - #html.write(r) - #html.close() - #os.utime(target, (singular.mtime, singular.mtime)) - class BaseIter(object): def __init__(self): @@ -248,97 +225,97 @@ class BaseIter(object): yield (k, v) return -class CMDLine(object): - def __init__(self, executable): - self.executable = self._which(executable) - if self.executable is None: - raise OSError('No %s found in PATH!' % executable) - return +#class CMDLine(object): + #def __init__(self, executable): + #self.executable = self._which(executable) + #if self.executable is None: + #raise OSError('No %s found in PATH!' % executable) + #return - @staticmethod - def _which(name): - for d in os.environ['PATH'].split(':'): - which = glob.glob(os.path.join(d, name), recursive=True) - if which: - return which.pop() - return None + #@staticmethod + #def _which(name): + #for d in os.environ['PATH'].split(':'): + #which = glob.glob(os.path.join(d, name), recursive=True) + #if which: + #return which.pop() + #return None - def __enter__(self): - self.process = subprocess.Popen( - [self.executable, "-stay_open", "True", "-@", "-"], - universal_newlines=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE) - return self + #def __enter__(self): + #self.process = subprocess.Popen( + #[self.executable, "-stay_open", "True", "-@", "-"], + #universal_newlines=True, + #stdin=subprocess.PIPE, stdout=subprocess.PIPE) + #return self - def __exit__(self, exc_type, exc_value, traceback): - self.process.stdin.write("-stay_open\nFalse\n") - self.process.stdin.flush() + #def __exit__(self, exc_type, exc_value, traceback): + #self.process.stdin.write("-stay_open\nFalse\n") + #self.process.stdin.flush() - def execute(self, *args): - args = args + ("-execute\n",) - self.process.stdin.write(str.join("\n", args)) - self.process.stdin.flush() - output = "" - fd = self.process.stdout.fileno() - while not output.endswith(self.sentinel): - output += os.read(fd, 4096).decode('utf-8', errors='ignore') - return output[:-len(self.sentinel)] + #def execute(self, *args): + #args = args + ("-execute\n",) + #self.process.stdin.write(str.join("\n", args)) + #self.process.stdin.flush() + #output = "" + #fd = self.process.stdout.fileno() + #while not output.endswith(self.sentinel): + #output += os.read(fd, 4096).decode('utf-8', errors='ignore') + #return output[:-len(self.sentinel)] -class Pandoc(CMDLine): - """ Handles calling external binary `exiftool` in an efficient way """ - def __init__(self, md2html=True): - super().__init__('pandoc') - if md2html: - self.i = "markdown+" + "+".join([ - 'backtick_code_blocks', - 'auto_identifiers', - 'fenced_code_attributes', - 'definition_lists', - 'grid_tables', - 'pipe_tables', - 'strikeout', - 'superscript', - 'subscript', - 'markdown_in_html_blocks', - 'shortcut_reference_links', - 'autolink_bare_uris', - 'raw_html', - 'link_attributes', - 'header_attributes', - 'footnotes', - ]) - self.o = 'html5' - else: - self.o = "markdown-" + "-".join([ - 'raw_html', - 'native_divs', - 'native_spans', - ]) - self.i = 'html' +#class Pandoc(CMDLine): + #""" Handles calling external binary `exiftool` in an efficient way """ + #def __init__(self, md2html=True): + #super().__init__('pandoc') + #if md2html: + #self.i = "markdown+" + "+".join([ + #'backtick_code_blocks', + #'auto_identifiers', + #'fenced_code_attributes', + #'definition_lists', + #'grid_tables', + #'pipe_tables', + #'strikeout', + #'superscript', + #'subscript', + #'markdown_in_html_blocks', + #'shortcut_reference_links', + #'autolink_bare_uris', + #'raw_html', + #'link_attributes', + #'header_attributes', + #'footnotes', + #]) + #self.o = 'html5' + #else: + #self.o = "markdown-" + "-".join([ + #'raw_html', + #'native_divs', + #'native_spans', + #]) + #self.i = 'html' - def convert(self, text): - cmd = ( - self.executable, - '-o-', - '--from=%s' % self.i, - '--to=%s' % self.o - ) - logging.debug('converting content with Pandoc') - p = subprocess.Popen( - cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) + #def convert(self, text): + #cmd = ( + #self.executable, + #'-o-', + #'--from=%s' % self.i, + #'--to=%s' % self.o + #) + #logging.debug('converting content with Pandoc') + #p = subprocess.Popen( + #cmd, + #stdin=subprocess.PIPE, + #stdout=subprocess.PIPE, + #stderr=subprocess.PIPE, + #) - stdout, stderr = p.communicate(input=text.encode()) - if stderr: - logging.error("Error during pandoc covert:\n\t%s\n\t%s", cmd, stderr) - return stdout.decode('utf-8').strip() + #stdout, stderr = p.communicate(input=text.encode()) + #if stderr: + #logging.error("Error during pandoc covert:\n\t%s\n\t%s", cmd, stderr) + #return stdout.decode('utf-8').strip() # based on http://stackoverflow.com/a/10075210 -class ExifTool(CMDLine): +class ExifTool(shared.CMDLine): """ Handles calling external binary `exiftool` in an efficient way """ sentinel = "{ready}\n" @@ -419,6 +396,7 @@ class WebImage(object): self.alttext = '' self.sizes = [] self.fallbacksize = int(shared.config.get('common','fallbackimg', fallback='720')) + self.cl = None for size in shared.config.options('downsize'): sizeext = shared.config.get('downsize', size) @@ -453,7 +431,7 @@ class WebImage(object): ) def __str__(self): - if self.is_downsizeable: + if self.is_downsizeable and not self.cl: return '\n\n' % ( self.target, self.fallback, @@ -461,8 +439,18 @@ class WebImage(object): self.fname, self.ext ) + elif self.cl: + self.cl = self.cl.replace('.', ' ') + return '' % ( + self.fallback, + self.cl, + self.alttext, + self.fname, + self.ext + ) + else: - return '\n\n' % ( + return '' % ( self.fallback, self.alttext, self.fname, @@ -768,10 +756,15 @@ class Content(BaseIter): self.front = Taxonomy() def populate(self): + now = arrow.utcnow().timestamp for fpath in self.files: item = Singular(fpath, self.images) self.append(item.pubtime, item) + if item.pubtime > now: + logging.warning("skipping future post %s", item.fname) + continue + if item.isonfront: self.front.append(item.pubtime, item) @@ -804,7 +797,7 @@ class Content(BaseIter): 'sitemap.txt' ) urls = [] - for t, item in self.data.items(): + for item in self.data.values(): urls.append( "%s/%s/" % ( shared.config.get('site', 'url'), item.fname @@ -814,6 +807,47 @@ class Content(BaseIter): logging.info("writing sitemap to %s" % (target)) f.write("\n".join(urls)) + def magicphp(self, renderer): + redirects = [] + gones = [] + rfile = os.path.join( + shared.config.get('common', 'basedir'), + shared.config.get('common', 'redirects') + ) + if os.path.isfile(rfile): + with open(rfile, newline='') as csvfile: + r = csv.reader(csvfile, delimiter=' ') + for row in r: + redirects.append((row[0], row[1])) + for item in self.data.values(): + redirects.append((item.shortslug, item.fname)) + + rfile = os.path.join( + shared.config.get('common', 'basedir'), + shared.config.get('common', 'gone') + ) + if os.path.isfile(rfile): + with open(rfile, newline='') as csvfile: + r = csv.reader(csvfile, delimiter=' ') + for row in r: + gones.append(row[0]) + + tmplvars = { + 'redirects': redirects, + 'gones': gones + } + + r = renderer.j2.get_template("magic.php").render(tmplvars) + target = os.path.abspath(os.path.join( + shared.config.get('target', 'builddir'), + 'magic.php' + )) + + with open(target, "w") as html: + logging.debug('writing %s', target) + html.write(r) + html.close() + class Singular(object): def __init__(self, path, images): logging.debug("initiating singular object from %s", path) @@ -874,6 +908,9 @@ class Singular(object): logging.debug("%s not found in images", fname) continue + if cl: + image.cl = cl + logging.debug( "replacing %s in content with %s", shortcode, @@ -904,6 +941,24 @@ class Singular(object): return reactions + @property + def urls(self): + urls = shared.URLREGEX.findall(self.content) + + for reactionurls in self.reactions.values(): + urls = [*urls, *reactionurls] + + r = [] + for link in urls: + domain = '{uri.netloc}'.format(uri=urllib.parse.urlparse(link)) + if domain in shared.config.get('site', 'domains'): + continue + if r.get(link, False): + continue + r.append(link) + + return r + @property def lang(self): lang = 'en' @@ -976,7 +1031,7 @@ class Singular(object): maybe = self.meta.get(maybe, False) if maybe: return maybe - return self.fname + return '' @property def url(self): @@ -1091,6 +1146,7 @@ class Singular(object): 'slug': self.fname, 'shortslug': self.shortslug, 'rssenclosure': self.rssenclosure, + 'copies': self.offlinecopies, } @property @@ -1143,6 +1199,12 @@ class NASG(object): def __init__(self): # --- set params parser = argparse.ArgumentParser(description='Parameters for NASG') + parser.add_argument( + '--clear', + action='store_true', + default=False, + help='clear build directory in advance' + ) parser.add_argument( '--regenerate', action='store_true', @@ -1217,6 +1279,13 @@ class NASG(object): await searchdb.append(singular) def run(self): + + if shared.config.getboolean('params', 'clear'): + input('about to clear build directory, press enter to continue') + shutil.rmtree(os.path.abspath( + shared.config.get('target', 'builddir') + )) + loop = asyncio.get_event_loop() for d in shared.config.options('target'): @@ -1235,8 +1304,8 @@ class NASG(object): content = Content(images) content.populate() + renderer = Renderer() if not shared.config.getboolean('params', 'norender'): - renderer = Renderer() logging.info("rendering content") loop.run_until_complete(self.__acrender(content, renderer)) @@ -1249,6 +1318,9 @@ class NASG(object): logging.info("rendering sitemap") content.sitemap() + logging.info("render magic.php") + content.magicphp(renderer) + logging.info("copy the static bits") src = shared.config.get('source', 'staticdir') for item in os.listdir(src): @@ -1264,7 +1336,6 @@ class NASG(object): loop.close() - if __name__ == '__main__': worker = NASG() worker.run() diff --git a/new.py b/new.py index 0686f6b..a9c3065 100644 --- a/new.py +++ b/new.py @@ -36,7 +36,7 @@ if __name__ == '__main__': now = arrow.utcnow() parser = argparse.ArgumentParser(description='create doc and print it to stdout') parser.add_argument('--tags', '-t', help='; separated, quoted list of tags') - parser.add_argument('--date', '-d', help=' YYYY-mm-ddTHH:MM:SS+TZTZ formatted date, if not now') + parser.add_argument('--date', '-d', help=' YYYY-mm-ddTHH:MM:SS+TZ formatted date, if not now') parser.add_argument('--slug', '-s', help='slug (normally autogenerated from title or pubdate)') parser.add_argument('--title', '-l', help='title of new entry') parser.add_argument('--bookmark', '-b', help='URL to bookmark') @@ -48,7 +48,7 @@ if __name__ == '__main__': args = vars(parser.parse_args()) if not args['date']: - d = now.format("YYYY-MM-DDTHH:mm:ssZ") + d = now.format(shared.ARROWISO) args['date'] = input('Date [%s]: ' % (d)) or d if not args['title']: diff --git a/requirements.txt b/requirements.txt index 3acb881..8df2aa7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ appdirs==1.4.3 arrow==0.10.0 breadability==0.1.20 chardet==3.0.3 +decorator==4.0.11 docopt==0.6.2 httptools==0.0.9 Jinja2==2.9.6 @@ -23,6 +24,7 @@ ujson==1.35 unicode-slugify==0.1.3 Unidecode==0.4.20 uvloop==0.8.0 +validators==0.11.3 Wand==0.4.4 websockets==3.3 Whoosh==2.7.4 diff --git a/search.py b/search.py new file mode 100644 index 0000000..088395c --- /dev/null +++ b/search.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +import asyncio +import uvloop +import os + +from sanic import Sanic +import sanic.response +from sanic.log import log as logging +from whoosh import index +from whoosh import qparser +from whoosh import fields +from whoosh import analysis +import jinja2 +import shared + +def SearchHandler(query, tmpl): + response = sanic.response.text( + "You seem to have forgot to enter what you want to search for. Please try again.", + status=400 + ) + + if not query: + return response + + query = query.replace('+', ' AND ').replace(' -', ' NOT ') + ix = index.open_dir(os.path.abspath(os.path.join( + shared.config.get('target', 'builddir'), + shared.config.get('var', 'searchdb') + ))) + + qp = qparser.MultifieldParser( + ["title", "content", "tags"], + schema = shared.schema + ) + + q = qp.parse(query) + r = ix.searcher().search(q, sortedby="weight", limit=100) + logging.info("results for '%s': %i", query, len(r)) + results = [] + for result in r: + res = { + 'title': result['title'], + 'url': result['url'], + 'highlight': result.highlights("content"), + } + if 'img' in result: + res['img'] = result['img'] + results.append(res) + + tvars = { + 'term': query, + 'posts': results, + } + + logging.info("collected %i results to render", len(results)) + response = sanic.response.html(tmpl.render(tvars), status=200) + return response + +if __name__ == '__main__': + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + app = Sanic() + + + jldr = jinja2.FileSystemLoader( + searchpath=shared.config.get('source', 'templatesdir') + ) + jenv = jinja2.Environment(loader=jldr) + tmpl = jenv.get_template('searchresults.html') + + @app.route("/search") + async def search(request, methods=["GET"]): + query = request.args.get('s') + r = SearchHandler(query, tmpl) + return r + + app.run(host="127.0.0.1", port=8001, debug=True) diff --git a/shared.py b/shared.py index 0a37ea2..428069f 100644 --- a/shared.py +++ b/shared.py @@ -1,8 +1,11 @@ import configparser import os +import re +import glob +import logging +import subprocess from whoosh import fields from whoosh import analysis -import re def __expandconfig(config): """ add the dirs to the config automatically """ @@ -18,6 +21,8 @@ def __expandconfig(config): )) return config +ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ' + URLREGEX = re.compile( r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+' r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*' @@ -74,3 +79,91 @@ config = configparser.ConfigParser( ) config.read('config.ini') config = __expandconfig(config) + +class CMDLine(object): + def __init__(self, executable): + self.executable = self._which(executable) + if self.executable is None: + raise OSError('No %s found in PATH!' % executable) + return + + @staticmethod + def _which(name): + for d in os.environ['PATH'].split(':'): + which = glob.glob(os.path.join(d, name), recursive=True) + if which: + return which.pop() + return None + + def __enter__(self): + self.process = subprocess.Popen( + [self.executable, "-stay_open", "True", "-@", "-"], + universal_newlines=True, + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.process.stdin.write("-stay_open\nFalse\n") + self.process.stdin.flush() + + def execute(self, *args): + args = args + ("-execute\n",) + self.process.stdin.write(str.join("\n", args)) + self.process.stdin.flush() + output = "" + fd = self.process.stdout.fileno() + while not output.endswith(self.sentinel): + output += os.read(fd, 4096).decode('utf-8', errors='ignore') + return output[:-len(self.sentinel)] + +class Pandoc(CMDLine): + """ Handles calling external binary `exiftool` in an efficient way """ + def __init__(self, md2html=True): + super().__init__('pandoc') + if md2html: + self.i = "markdown+" + "+".join([ + 'backtick_code_blocks', + 'auto_identifiers', + 'fenced_code_attributes', + 'definition_lists', + 'grid_tables', + 'pipe_tables', + 'strikeout', + 'superscript', + 'subscript', + 'markdown_in_html_blocks', + 'shortcut_reference_links', + 'autolink_bare_uris', + 'raw_html', + 'link_attributes', + 'header_attributes', + 'footnotes', + ]) + self.o = 'html5' + else: + self.o = "markdown-" + "-".join([ + 'raw_html', + 'native_divs', + 'native_spans', + ]) + self.i = 'html' + + def convert(self, text): + cmd = ( + self.executable, + '-o-', + '--from=%s' % self.i, + '--to=%s' % self.o + ) + logging.debug('converting content with Pandoc') + p = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + stdout, stderr = p.communicate(input=text.encode()) + if stderr: + logging.error("Error during pandoc covert:\n\t%s\n\t%s", cmd, stderr) + return stdout.decode('utf-8').strip() diff --git a/webmention.py b/webmention.py new file mode 100644 index 0000000..ae37f1f --- /dev/null +++ b/webmention.py @@ -0,0 +1,193 @@ +import asyncio +import uvloop +import os +import hashlib +import json +import urllib.parse +import frontmatter +from sanic import Sanic +import sanic.response +from sanic.log import log as logging +import validators +import arrow +from webmentiontools import urlinfo +import shared +import envelope + + +class WebmentionHandler(object): + def __init__ (self, source, target): + self.source = source + self.target = target + self.now = arrow.utcnow().timestamp + logging.info("incoming webmention %s => %s", self.source, self.target) + + self.r = sanic.response.text( + "something went wrong on my side, could you please let me know at hello@petermolnar.eu ?", + status=500 + ) + + def run(self): + if not self._validate(): + return + + self._parse() + self._save() + self._notify() + + def _validate(self): + test = { + self.source: '"souce" parameter is an invalid URL', + self.target: '"target" parameter is an invalid URL' + } + for url, emsg in test.items(): + logging.debug("validating URL %s", url) + if not validators.url(url): + self.r = sanic.response.text( + emsg, + status=400 + ) + return False + + logging.debug("checking target domain") + _target = urllib.parse.urlparse(self.target) + _target_domain = '{uri.netloc}'.format(uri=_target) + _mydomains = shared.config.get('site', 'domains').split(" ") + if not _target_domain in _mydomains: + self.r = sanic.response.text( + "'target' is not in the list of allowed domains", + status=400 + ) + return False + + logging.debug("checking selfpings") + _source = urllib.parse.urlparse(self.source) + _source_domain = '{uri.netloc}'.format(uri=_source) + if _source_domain in _mydomains: + self.r = sanic.response.text( + "selfpings are not allowed", + status=400 + ) + return False + + return True + + def _parse(self): + logging.debug("fetching %s", self.source) + self._source = urlinfo.UrlInfo(self.source) + if self._source.error: + self.r = sanic.response.text( + "couldn't fetch 'source' from %s" % (self.source), + status=408 + ) + return False + + self.source = self._source.realurl + if not self._source.linksTo(self.target): + self.r = sanic.response.text( + "'source' (%s) does not link to 'target' (%s)" % ( + self.source, + self.target + ), + status=400 + ) + return False + + logging.debug("fetching %s", self.target) + self._target = urlinfo.UrlInfo(self.target) + if self._target.error: + self.r = sanic.response.text( + "couldn't fetch 'target' from %s" % (self.target), + status=408 + ) + self.target = self._target.realurl + #logging.info("parsed webmention:\n%s\n\n%s", self.meta, self.content) + + def _save(self): + doc = frontmatter.loads('') + doc.metadata = self.meta + doc.content = self.content + target = os.path.join( + shared.config.get('source', 'commentsdir'), + self.mhash + ) + if os.path.isfile(target): + logging.warning('updating existing webmention %s', target) + else: + logging.warning('saving incoming webmention to %s', target) + + with open(target, 'wt') as t: + t.write(frontmatter.dumps(doc)) + self.r = sanic.response.text( + "accepted", + status=202 + ) + + def _notify(self): + text = "# webmention\n## Source\n\nauthor\n: %s\n\nURL\n: %s\n\nemail\n: %s\n\ndate\n: %s\n\n## Target\n\nURL\n: %s\n\n---\n\n%s" % ( + self._meta['author'].get('name', self.source), + self._meta['author'].get('url', self.source), + self._meta['author'].get('email', ''), + self._meta['date'], + self.target, + self.content + ) + + l = envelope.Letter( + sender=( + shared.config.get('webmention', 'from_name'), + shared.config.get('webmention', 'from_address') + ), + recipient=( + shared.config.get('webmention', 'to_name'), + shared.config.get('webmention', 'to_address') + ), + subject="[webmention] %s" % self.source, + text=text + ) + l.make() + l.send() + + @property + def mhash(self): + return hashlib.sha1(json.dumps(self.meta, sort_keys=True).encode('utf-8')).hexdigest() + + @property + def meta(self): + if hasattr(self, '_meta'): + return self._meta + + self._meta = { + 'author': self._source.author(), + 'type': self._source.relationType(), + 'target': self.target, + 'source': self.source, + 'date': arrow.get(self._source.pubDate()).format(shared.ARROWISO), + } + return self._meta + + @property + def content(self): + if hasattr(self, '_content'): + return self._content + + # from HTML to Markdown + self._content = shared.Pandoc(False).convert(self._source.content()) + # from Markdown back to HTML + #self._content = shared.Pandoc().convert(tmpcontent) + return self._content + + +if __name__ == '__main__': + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + app = Sanic() + + @app.route("/webmention", methods=["POST"]) + async def wm(request): + source = request.form.get('source') + target = request.form.get('target') + r = WebmentionHandler(source, target) + r.run() + return r.r + + app.run(host="127.0.0.1", port=8002, debug=True)