working search and webmentions receiver

This commit is contained in:
Peter Molnar 2017-05-26 10:14:24 +01:00
parent 1b7b354a88
commit 558195288d
7 changed files with 752 additions and 123 deletions

193
envelope.py Normal file
View file

@ -0,0 +1,193 @@
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from email.header import Header
import email.charset
from email.generator import Generator
from io import StringIO
import mimetypes
from email.mime.base import MIMEBase
from email.encoders import encode_base64
import email.utils
import time
import getpass
import socket
import shutil
import requests
import tempfile
import atexit
import os
import re
import smtplib
import logging
from shared import Pandoc
class Letter(object):
    """Build and send a multipart e-mail from Markdown text.

    The message is ``multipart/alternative``: the Markdown source is the
    plain-text part, and a ``multipart/related`` part holds the HTML produced
    by ``Pandoc().convert`` together with the images referenced from the
    Markdown, which are downloaded and attached inline (``cid:`` links).

    Args:
        sender: ``(name, address)`` tuple; defaults to the current user name
            and the host name.
        recipient: ``(name, address)`` tuple; defaults to ``sender``.
        subject: subject line of the mail.
        text: Markdown body of the mail.
    """

    # Markdown image references: ![alt](url "optional title"){optional attrs}
    _IMAGE_RE = re.compile(
        r'!\[.*\]\((.*?\.(?:jpe?g|png|gif)(?:\s+[\'\"]?.*?[\'\"]?)?)\)'
        r'(?:\{.*?\})?'
    )

    # seconds to wait for a remote image before giving up on it
    image_timeout = 30

    def __init__(self, sender=None, recipient=None, subject='', text=''):
        self.sender = sender or (getpass.getuser(), socket.gethostname())
        self.recipient = recipient or self.sender
        # scratch directory for downloaded images; 'envelope_' is meant to be
        # the prefix (the first positional argument of mkdtemp is the suffix)
        self.tmp = tempfile.mkdtemp(
            prefix='envelope_',
            dir=tempfile.gettempdir()
        )
        # best-effort cleanup at interpreter exit
        atexit.register(
            shutil.rmtree,
            os.path.abspath(self.tmp),
            ignore_errors=True
        )
        self.text = text
        self.subject = subject
        self.images = []
        self.ready = None
        self.time = time.time()
        self.headers = {}

    @property
    def _html(self):
        """The Markdown text converted with Pandoc."""
        return Pandoc().convert(self.text)

    @property
    def _tmpl(self):
        """The converted text wrapped in a minimal HTML document."""
        return "<html><head></head><body>%s</body></html>" % (self._html)

    def __pull_image(self, img):
        """Download one image into the scratch directory and remember it.

        ``img`` is the text captured between the parentheses of a Markdown
        image reference, so it may carry a trailing title; only the first
        whitespace-separated token is the URL.
        """
        if not isinstance(self.images, list):
            self.images = []

        url = img.split()[0]
        if any(known['url'] == url for known in self.images):
            # the same image referenced twice needs only one attachment
            return

        fname = os.path.basename(url)
        i = {
            'url': url,
            'name': fname,
            'tmp': os.path.join(self.tmp, fname),
        }

        logging.debug("pulling image %s", i['url'])
        try:
            r = requests.get(
                i['url'],
                stream=True,
                timeout=self.image_timeout
            )
        except requests.exceptions.RequestException as e:
            # an unreachable image is skipped, same as a non-200 answer
            logging.warning("pulling image %s failed: %s", i['url'], e)
            return

        try:
            if r.status_code != 200:
                return
            with open(i['tmp'], 'wb') as f:
                logging.debug("writing image %s", i['tmp'])
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        finally:
            r.close()

        self.images.append(i)

    def __pull_images(self):
        """Download every image referenced from the Markdown text."""
        for img in self._IMAGE_RE.findall(self.text):
            self.__pull_image(img)

    def __attach_images(self):
        """Download the images and point the text at their ``cid:`` names."""
        self.__pull_images()
        for i in self.images:
            cid = 'cid:%s' % (i['name'])
            logging.debug("replacing %s with %s", i['url'], cid)
            self.text = self.text.replace(i['url'], cid)

    @staticmethod
    def _utf8_part(payload, subtype):
        """Return a ``text/<subtype>`` part carrying raw, 8bit UTF-8.

        This overrides what Python would do by default: the payload is set
        as the actual, unencoded text.
        Note: this breaks compatibility with ancient mail clients.
        """
        part = MIMEText(payload, subtype, _charset='utf-8')
        part.replace_header('Content-Transfer-Encoding', '8bit')
        part.replace_header(
            'Content-Type',
            'text/%s; charset=utf-8' % subtype
        )
        part.set_payload(payload)
        return part

    def make(self, inline_images=True):
        """Assemble the message and store its bytes in ``self.ready``.

        Args:
            inline_images: when true, images referenced from the Markdown
                are downloaded and attached inline.
        """
        if inline_images:
            self.__attach_images()

        # Python, by default, encodes utf-8 in base64, which makes plain text
        # mail painful; this overrides and forces Quoted Printable.
        # Quoted Printable is still awful, but better, and we're going to
        # force the mail to be 8bit encoded.
        # Note: enforcing 8bit breaks compatibility with ancient mail clients.
        email.charset.add_charset(
            'utf-8',
            email.charset.QP,
            email.charset.QP,
            'utf-8'
        )

        mail = MIMEMultipart('alternative')

        # --- setting headers ---
        self.headers = {
            'Subject': Header(
                re.sub(r"\r?\n?$", "", self.subject, 1),
                'utf-8'
            ).encode(),
            'To': email.utils.formataddr(self.recipient),
            'From': email.utils.formataddr(self.sender),
            'Date': email.utils.formatdate(self.time, localtime=True)
        }
        for k, v in self.headers.items():
            mail.add_header(k, "%s" % v)
        logging.debug("headers: %s", self.headers)

        # --- adding plain text ---
        _text = self._utf8_part(self.text, 'plain')
        logging.debug("text: %s", _text)
        mail.attach(_text)

        # --- HTML bit ---
        # this is where it gets tricky: the HTML part should be a 'related'
        # wrapper, in which the text and all the related images are sitting
        _envelope = MIMEMultipart('related')
        _html = self._utf8_part(self._tmpl, 'html')
        logging.debug("HTML: %s", _html)
        _envelope.attach(_html)

        for i in self.images:
            mimetype, encoding = mimetypes.guess_type(i['tmp'])
            mimetype = mimetype or 'application/octet-stream'
            maintype, subtype = mimetype.split('/', 1)
            attachment = MIMEBase(maintype, subtype)
            with open(i['tmp'], 'rb') as img:
                attachment.set_payload(img.read())
            # the payload is in memory now, the scratch file is not needed
            os.unlink(i['tmp'])

            encode_base64(attachment)
            attachment.add_header(
                'Content-Disposition',
                'inline',
                filename=i['name']
            )
            attachment.add_header(
                'Content-ID',
                '<%s>' % (i['name'])
            )
            _envelope.attach(attachment)

        # add the whole html + image pack to the mail
        mail.attach(_envelope)

        str_io = StringIO()
        g = Generator(str_io, False)
        g.flatten(mail)
        # sendmail() would try to encode a str as ASCII and fail on the 8bit
        # content, so the message is kept as UTF-8 bytes
        self.ready = str_io.getvalue().encode('utf-8')

    def send(self):
        """Hand the assembled message to the SMTP server on localhost.

        Does nothing but log an error when ``make()`` has not been called
        yet; delivery failures are logged, not raised.
        """
        if not self.ready:
            logging.error('this mail is not ready')
            return

        try:
            # the context manager issues QUIT and closes the socket even
            # when sendmail() raises
            with smtplib.SMTP('127.0.0.1', 25) as s:
                s.sendmail(
                    self.headers['From'],
                    self.headers['To'],
                    self.ready
                )
        except Exception as e:
            logging.error('sending mail failed with error: %s', e)

311
nasg.py Normal file → Executable file
View file

@ -8,15 +8,15 @@ import shutil
import logging
import json
import glob
import subprocess
import tempfile
import atexit
import re
import hashlib
import math
import asyncio
import magic
import csv
import magic
import arrow
import wand.image
import similar_text
@ -27,7 +27,7 @@ import requests
from breadability.readable import Article
from whoosh import index
import jinja2
import urllib.parse
import shared
def splitpath(path):
@ -70,13 +70,19 @@ class Indexer(object):
for url, offlinecopy in singular.offlinecopies.items():
content_remote.append("%s" % offlinecopy)
weight = 1
if singular.isbookmark:
weight = 10
if singular.ispage:
weight = 100
self.writer.add_document(
title=singular.title,
url=singular.url,
content=" ".join(list(map(str,[*content_real, *content_remote]))),
date=singular.published.datetime,
tags=",".join(list(map(str, singular.tags))),
weight=1,
weight=weight,
img="%s" % singular.photo
)
@ -190,35 +196,6 @@ class Renderer(object):
return True
return False
#def rendersingular(self, singular):
#logging.debug("rendering and saving %s", singular.fname)
#targetdir = os.path.abspath(os.path.join(
#shared.config.get('target', 'builddir'),
#singular.fname
#))
#target = os.path.join(targetdir, 'index.html')
#if not shared.config.get('params', 'force') and os.path.isfile(target):
#ttime = int(os.path.getmtime(target))
#if ttime == singular.mtime:
#logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime)
#return
#if not os.path.isdir(targetdir):
#os.mkdir(targetdir)
#tmpl = self.j2.get_template(singular.tmplfile)
#tmplvars = {
#'post': singular.tmplvars,
#'site': self.sitevars,
#'taxonomy': {},
#}
#r = tmpl.render(tmplvars)
#with open(target, "w") as html:
#html.write(r)
#html.close()
#os.utime(target, (singular.mtime, singular.mtime))
class BaseIter(object):
def __init__(self):
@ -248,97 +225,97 @@ class BaseIter(object):
yield (k, v)
return
class CMDLine(object):
def __init__(self, executable):
self.executable = self._which(executable)
if self.executable is None:
raise OSError('No %s found in PATH!' % executable)
return
#class CMDLine(object):
#def __init__(self, executable):
#self.executable = self._which(executable)
#if self.executable is None:
#raise OSError('No %s found in PATH!' % executable)
#return
@staticmethod
def _which(name):
for d in os.environ['PATH'].split(':'):
which = glob.glob(os.path.join(d, name), recursive=True)
if which:
return which.pop()
return None
#@staticmethod
#def _which(name):
#for d in os.environ['PATH'].split(':'):
#which = glob.glob(os.path.join(d, name), recursive=True)
#if which:
#return which.pop()
#return None
def __enter__(self):
self.process = subprocess.Popen(
[self.executable, "-stay_open", "True", "-@", "-"],
universal_newlines=True,
stdin=subprocess.PIPE, stdout=subprocess.PIPE)
return self
#def __enter__(self):
#self.process = subprocess.Popen(
#[self.executable, "-stay_open", "True", "-@", "-"],
#universal_newlines=True,
#stdin=subprocess.PIPE, stdout=subprocess.PIPE)
#return self
def __exit__(self, exc_type, exc_value, traceback):
self.process.stdin.write("-stay_open\nFalse\n")
self.process.stdin.flush()
#def __exit__(self, exc_type, exc_value, traceback):
#self.process.stdin.write("-stay_open\nFalse\n")
#self.process.stdin.flush()
def execute(self, *args):
args = args + ("-execute\n",)
self.process.stdin.write(str.join("\n", args))
self.process.stdin.flush()
output = ""
fd = self.process.stdout.fileno()
while not output.endswith(self.sentinel):
output += os.read(fd, 4096).decode('utf-8', errors='ignore')
return output[:-len(self.sentinel)]
#def execute(self, *args):
#args = args + ("-execute\n",)
#self.process.stdin.write(str.join("\n", args))
#self.process.stdin.flush()
#output = ""
#fd = self.process.stdout.fileno()
#while not output.endswith(self.sentinel):
#output += os.read(fd, 4096).decode('utf-8', errors='ignore')
#return output[:-len(self.sentinel)]
class Pandoc(CMDLine):
""" Handles calling external binary `exiftool` in an efficient way """
def __init__(self, md2html=True):
super().__init__('pandoc')
if md2html:
self.i = "markdown+" + "+".join([
'backtick_code_blocks',
'auto_identifiers',
'fenced_code_attributes',
'definition_lists',
'grid_tables',
'pipe_tables',
'strikeout',
'superscript',
'subscript',
'markdown_in_html_blocks',
'shortcut_reference_links',
'autolink_bare_uris',
'raw_html',
'link_attributes',
'header_attributes',
'footnotes',
])
self.o = 'html5'
else:
self.o = "markdown-" + "-".join([
'raw_html',
'native_divs',
'native_spans',
])
self.i = 'html'
#class Pandoc(CMDLine):
#""" Handles calling external binary `exiftool` in an efficient way """
#def __init__(self, md2html=True):
#super().__init__('pandoc')
#if md2html:
#self.i = "markdown+" + "+".join([
#'backtick_code_blocks',
#'auto_identifiers',
#'fenced_code_attributes',
#'definition_lists',
#'grid_tables',
#'pipe_tables',
#'strikeout',
#'superscript',
#'subscript',
#'markdown_in_html_blocks',
#'shortcut_reference_links',
#'autolink_bare_uris',
#'raw_html',
#'link_attributes',
#'header_attributes',
#'footnotes',
#])
#self.o = 'html5'
#else:
#self.o = "markdown-" + "-".join([
#'raw_html',
#'native_divs',
#'native_spans',
#])
#self.i = 'html'
def convert(self, text):
cmd = (
self.executable,
'-o-',
'--from=%s' % self.i,
'--to=%s' % self.o
)
logging.debug('converting content with Pandoc')
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
#def convert(self, text):
#cmd = (
#self.executable,
#'-o-',
#'--from=%s' % self.i,
#'--to=%s' % self.o
#)
#logging.debug('converting content with Pandoc')
#p = subprocess.Popen(
#cmd,
#stdin=subprocess.PIPE,
#stdout=subprocess.PIPE,
#stderr=subprocess.PIPE,
#)
stdout, stderr = p.communicate(input=text.encode())
if stderr:
logging.error("Error during pandoc covert:\n\t%s\n\t%s", cmd, stderr)
return stdout.decode('utf-8').strip()
#stdout, stderr = p.communicate(input=text.encode())
#if stderr:
#logging.error("Error during pandoc covert:\n\t%s\n\t%s", cmd, stderr)
#return stdout.decode('utf-8').strip()
# based on http://stackoverflow.com/a/10075210
class ExifTool(CMDLine):
class ExifTool(shared.CMDLine):
""" Handles calling external binary `exiftool` in an efficient way """
sentinel = "{ready}\n"
@ -419,6 +396,7 @@ class WebImage(object):
self.alttext = ''
self.sizes = []
self.fallbacksize = int(shared.config.get('common','fallbackimg', fallback='720'))
self.cl = None
for size in shared.config.options('downsize'):
sizeext = shared.config.get('downsize', size)
@ -453,7 +431,7 @@ class WebImage(object):
)
def __str__(self):
if self.is_downsizeable:
if self.is_downsizeable and not self.cl:
return '\n<figure class="photo"><a target="_blank" class="adaptive" href="%s"><img src="%s" class="adaptimg" alt="%s" /></a><figcaption class=\"caption\">%s%s</figcaption></figure>\n' % (
self.target,
self.fallback,
@ -461,8 +439,18 @@ class WebImage(object):
self.fname,
self.ext
)
elif self.cl:
self.cl = self.cl.replace('.', ' ')
return '<img src="%s" class="%s" alt="%s" title="%s%s" />' % (
self.fallback,
self.cl,
self.alttext,
self.fname,
self.ext
)
else:
return '\n<figure class="picture"><img src="%s" class="aligncenter" alt="%s" /><figcaption class=\"caption\">%s%s</figcaption></figure>\n' % (
return '<img src="%s" class="aligncenter" alt="%s" title="%s%s" />' % (
self.fallback,
self.alttext,
self.fname,
@ -768,10 +756,15 @@ class Content(BaseIter):
self.front = Taxonomy()
def populate(self):
now = arrow.utcnow().timestamp
for fpath in self.files:
item = Singular(fpath, self.images)
self.append(item.pubtime, item)
if item.pubtime > now:
logging.warning("skipping future post %s", item.fname)
continue
if item.isonfront:
self.front.append(item.pubtime, item)
@ -804,7 +797,7 @@ class Content(BaseIter):
'sitemap.txt'
)
urls = []
for t, item in self.data.items():
for item in self.data.values():
urls.append( "%s/%s/" % (
shared.config.get('site', 'url'),
item.fname
@ -814,6 +807,47 @@ class Content(BaseIter):
logging.info("writing sitemap to %s" % (target))
f.write("\n".join(urls))
def magicphp(self, renderer):
    """Render the ``magic.php`` template into the build directory.

    The template receives ``redirects`` - ``(from, to)`` pairs read from
    the space-delimited file named by the ``common``/``redirects`` config
    entry, plus a ``(shortslug, fname)`` pair for every item in
    ``self.data`` - and ``gones``, the first column of the file named by
    the ``common``/``gone`` config entry. Missing files are ignored.

    Args:
        renderer: object whose ``j2`` attribute provides ``get_template``.
    """
    def _rows(option):
        """Return the non-empty rows of the file configured as `option`."""
        rfile = os.path.join(
            shared.config.get('common', 'basedir'),
            shared.config.get('common', option)
        )
        if not os.path.isfile(rfile):
            return []
        with open(rfile, newline='') as csvfile:
            # csv yields [] for blank lines; those carry no information
            return [row for row in csv.reader(csvfile, delimiter=' ') if row]

    redirects = []
    for row in _rows('redirects'):
        if len(row) < 2:
            # a redirect needs both a source and a destination
            logging.warning('skipping malformed redirect row: %s', row)
            continue
        redirects.append((row[0], row[1]))

    for item in self.data.values():
        redirects.append((item.shortslug, item.fname))

    gones = [row[0] for row in _rows('gone')]

    tmplvars = {
        'redirects': redirects,
        'gones': gones
    }

    r = renderer.j2.get_template("magic.php").render(tmplvars)
    target = os.path.abspath(os.path.join(
        shared.config.get('target', 'builddir'),
        'magic.php'
    ))

    with open(target, "w") as html:
        logging.debug('writing %s', target)
        html.write(r)
class Singular(object):
def __init__(self, path, images):
logging.debug("initiating singular object from %s", path)
@ -874,6 +908,9 @@ class Singular(object):
logging.debug("%s not found in images", fname)
continue
if cl:
image.cl = cl
logging.debug(
"replacing %s in content with %s",
shortcode,
@ -904,6 +941,24 @@ class Singular(object):
return reactions
@property
def urls(self):
    """List the distinct external URLs this entry refers to.

    Candidates are the ``shared.URLREGEX`` matches in ``self.content``
    followed by every value of ``self.reactions`` (presumably lists of
    URLs - verify against the ``reactions`` property). Links pointing to
    one of the site's own domains are dropped, and each link is reported
    once, in order of first appearance.
    """
    candidates = list(shared.URLREGEX.findall(self.content))
    for reactionurls in self.reactions.values():
        candidates.extend(reactionurls)

    # 'domains' is a whitespace separated list; compare whole host names
    # instead of substring-matching the raw config string
    own_domains = shared.config.get('site', 'domains').split()

    seen = set()
    r = []
    for link in candidates:
        domain = urllib.parse.urlparse(link).netloc
        # a link without a host can not be an external one
        if not domain or domain in own_domains:
            continue
        if link in seen:
            continue
        seen.add(link)
        r.append(link)
    return r
@property
def lang(self):
lang = 'en'
@ -976,7 +1031,7 @@ class Singular(object):
maybe = self.meta.get(maybe, False)
if maybe:
return maybe
return self.fname
return ''
@property
def url(self):
@ -1091,6 +1146,7 @@ class Singular(object):
'slug': self.fname,
'shortslug': self.shortslug,
'rssenclosure': self.rssenclosure,
'copies': self.offlinecopies,
}
@property
@ -1143,6 +1199,12 @@ class NASG(object):
def __init__(self):
# --- set params
parser = argparse.ArgumentParser(description='Parameters for NASG')
parser.add_argument(
'--clear',
action='store_true',
default=False,
help='clear build directory in advance'
)
parser.add_argument(
'--regenerate',
action='store_true',
@ -1217,6 +1279,13 @@ class NASG(object):
await searchdb.append(singular)
def run(self):
if shared.config.getboolean('params', 'clear'):
input('about to clear build directory, press enter to continue')
shutil.rmtree(os.path.abspath(
shared.config.get('target', 'builddir')
))
loop = asyncio.get_event_loop()
for d in shared.config.options('target'):
@ -1235,8 +1304,8 @@ class NASG(object):
content = Content(images)
content.populate()
renderer = Renderer()
if not shared.config.getboolean('params', 'norender'):
renderer = Renderer()
logging.info("rendering content")
loop.run_until_complete(self.__acrender(content, renderer))
@ -1249,6 +1318,9 @@ class NASG(object):
logging.info("rendering sitemap")
content.sitemap()
logging.info("render magic.php")
content.magicphp(renderer)
logging.info("copy the static bits")
src = shared.config.get('source', 'staticdir')
for item in os.listdir(src):
@ -1264,7 +1336,6 @@ class NASG(object):
loop.close()
if __name__ == '__main__':
worker = NASG()
worker.run()

4
new.py
View file

@ -36,7 +36,7 @@ if __name__ == '__main__':
now = arrow.utcnow()
parser = argparse.ArgumentParser(description='create doc and print it to stdout')
parser.add_argument('--tags', '-t', help='; separated, quoted list of tags')
parser.add_argument('--date', '-d', help=' YYYY-mm-ddTHH:MM:SS+TZTZ formatted date, if not now')
parser.add_argument('--date', '-d', help=' YYYY-mm-ddTHH:MM:SS+TZ formatted date, if not now')
parser.add_argument('--slug', '-s', help='slug (normally autogenerated from title or pubdate)')
parser.add_argument('--title', '-l', help='title of new entry')
parser.add_argument('--bookmark', '-b', help='URL to bookmark')
@ -48,7 +48,7 @@ if __name__ == '__main__':
args = vars(parser.parse_args())
if not args['date']:
d = now.format("YYYY-MM-DDTHH:mm:ssZ")
d = now.format(shared.ARROWISO)
args['date'] = input('Date [%s]: ' % (d)) or d
if not args['title']:

View file

@ -3,6 +3,7 @@ appdirs==1.4.3
arrow==0.10.0
breadability==0.1.20
chardet==3.0.3
decorator==4.0.11
docopt==0.6.2
httptools==0.0.9
Jinja2==2.9.6
@ -23,6 +24,7 @@ ujson==1.35
unicode-slugify==0.1.3
Unidecode==0.4.20
uvloop==0.8.0
validators==0.11.3
Wand==0.4.4
websockets==3.3
Whoosh==2.7.4

77
search.py Normal file
View file

@ -0,0 +1,77 @@
#!/usr/bin/env python3
import asyncio
import uvloop
import os
from sanic import Sanic
import sanic.response
from sanic.log import log as logging
from whoosh import index
from whoosh import qparser
from whoosh import fields
from whoosh import analysis
import jinja2
import shared
def SearchHandler(query, tmpl):
    """Run a full-text search and render the hits.

    Args:
        query: the raw search string; ``+`` is turned into ``AND`` and
            `` -`` into ``NOT`` before parsing.
        tmpl: template object whose ``render`` receives ``term`` (the
            rewritten query) and ``posts`` (list of result dicts).

    Returns:
        A sanic text response with status 400 when ``query`` is empty,
        otherwise an HTML response with status 200.
    """
    if not query:
        return sanic.response.text(
            "You seem to have forgot to enter what you want to search for. Please try again.",
            status=400
        )

    query = query.replace('+', ' AND ').replace(' -', ' NOT ')

    ix = index.open_dir(os.path.abspath(os.path.join(
        shared.config.get('target', 'builddir'),
        shared.config.get('var', 'searchdb')
    )))
    try:
        qp = qparser.MultifieldParser(
            ["title", "content", "tags"],
            schema=shared.schema
        )
        q = qp.parse(query)

        results = []
        # hits read from the index lazily, so everything needed is copied
        # out of them before the searcher is closed
        with ix.searcher() as searcher:
            r = searcher.search(q, sortedby="weight", limit=100)
            logging.info("results for '%s': %i", query, len(r))
            for result in r:
                res = {
                    'title': result['title'],
                    'url': result['url'],
                    'highlight': result.highlights("content"),
                }
                if 'img' in result:
                    res['img'] = result['img']
                results.append(res)
    finally:
        ix.close()

    tvars = {
        'term': query,
        'posts': results,
    }

    logging.info("collected %i results to render", len(results))
    return sanic.response.html(tmpl.render(tvars), status=200)
if __name__ == '__main__':
    # serve the search endpoint on a uvloop-backed event loop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
    app = Sanic()

    # the results template is loaded once, at start-up
    jldr = jinja2.FileSystemLoader(
        searchpath=shared.config.get('source', 'templatesdir')
    )
    jenv = jinja2.Environment(loader=jldr)
    tmpl = jenv.get_template('searchresults.html')

    # the allowed methods belong to the route, not to the handler signature
    @app.route("/search", methods=["GET"])
    async def search(request):
        """Answer ``GET /search?s=<query>`` with the rendered results."""
        query = request.args.get('s')
        return SearchHandler(query, tmpl)

    app.run(host="127.0.0.1", port=8001, debug=True)

View file

@ -1,8 +1,11 @@
import configparser
import os
import re
import glob
import logging
import subprocess
from whoosh import fields
from whoosh import analysis
import re
def __expandconfig(config):
""" add the dirs to the config automatically """
@ -18,6 +21,8 @@ def __expandconfig(config):
))
return config
ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
URLREGEX = re.compile(
r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
@ -74,3 +79,91 @@ config = configparser.ConfigParser(
)
config.read('config.ini')
config = __expandconfig(config)
class CMDLine(object):
    """Wrapper around an external command line program.

    The executable is looked up in ``PATH`` on construction. Used as a
    context manager, the program is started once with
    ``-stay_open True -@ -`` and fed commands on its stdin through
    ``execute``; each answer is read up to ``sentinel``.

    Raises:
        OSError: when the executable can not be found in ``PATH``.
    """

    # marks the end of one answer on the child's stdout; subclasses may
    # override it
    sentinel = "{ready}\n"

    # seconds to wait for the child to exit before killing it
    exit_timeout = 5

    def __init__(self, executable):
        self.executable = self._which(executable)
        if self.executable is None:
            raise OSError('No %s found in PATH!' % executable)

    @staticmethod
    def _which(name):
        """Return the full path of `name` found in PATH, or None."""
        import shutil
        return shutil.which(name)

    def __enter__(self):
        self.process = subprocess.Popen(
            [self.executable, "-stay_open", "True", "-@", "-"],
            universal_newlines=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        proc = self.process
        try:
            proc.stdin.write("-stay_open\nFalse\n")
            proc.stdin.flush()
        finally:
            # close the pipes and reap the child so neither file
            # descriptors nor a zombie process are left behind
            proc.stdin.close()
            try:
                proc.wait(timeout=self.exit_timeout)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()
            proc.stdout.close()

    def execute(self, *args):
        """Send `args` to the running process and return its answer.

        The arguments are written one per line, followed by ``-execute``.
        Output is collected until ``sentinel`` shows up, and returned
        without it. If the process closes its output before that, whatever
        was received is returned and an error is logged.
        """
        args = args + ("-execute\n",)
        self.process.stdin.write(str.join("\n", args))
        self.process.stdin.flush()

        fd = self.process.stdout.fileno()
        marker = self.sentinel.encode('utf-8')
        # collect raw bytes and decode once: decoding chunk by chunk would
        # drop a multibyte character split across two reads
        raw = b""
        while not raw.endswith(marker):
            chunk = os.read(fd, 4096)
            if not chunk:
                # EOF: the sentinel will never arrive
                logging.error(
                    '%s closed its output before sending the sentinel',
                    self.executable
                )
                return raw.decode('utf-8', errors='ignore')
            raw += chunk
        return raw[:-len(marker)].decode('utf-8', errors='ignore')
class Pandoc(CMDLine):
    """Convert text between Markdown and HTML with the `pandoc` binary.

    Args:
        md2html: when true (the default) the conversion goes from extended
            Markdown to HTML5; otherwise from HTML to Markdown with raw
            HTML and native divs/spans disabled.

    Raises:
        OSError: when `pandoc` can not be found in ``PATH``.
    """

    # Markdown extensions enabled on the input of a Markdown => HTML run
    MD_EXTENSIONS = (
        'backtick_code_blocks',
        'auto_identifiers',
        'fenced_code_attributes',
        'definition_lists',
        'grid_tables',
        'pipe_tables',
        'strikeout',
        'superscript',
        'subscript',
        'markdown_in_html_blocks',
        'shortcut_reference_links',
        'autolink_bare_uris',
        'raw_html',
        'link_attributes',
        'header_attributes',
        'footnotes',
    )

    # Markdown extensions disabled on the output of an HTML => Markdown run
    HTML_DISABLED = (
        'raw_html',
        'native_divs',
        'native_spans',
    )

    def __init__(self, md2html=True):
        super().__init__('pandoc')
        if md2html:
            self.i = "markdown+" + "+".join(self.MD_EXTENSIONS)
            self.o = 'html5'
        else:
            self.o = "markdown-" + "-".join(self.HTML_DISABLED)
            self.i = 'html'

    def convert(self, text):
        """Pipe `text` through pandoc and return the stripped result.

        Anything pandoc prints on stderr is logged as an error; the
        converted output is returned regardless.
        """
        cmd = (
            self.executable,
            '-o-',
            '--from=%s' % self.i,
            '--to=%s' % self.o
        )
        logging.debug('converting content with Pandoc')
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        stdout, stderr = p.communicate(input=text.encode('utf-8'))
        if stderr:
            # decode so the log shows the message instead of a bytes repr
            logging.error(
                "Error during pandoc covert:\n\t%s\n\t%s",
                cmd,
                stderr.decode('utf-8', errors='replace')
            )
        return stdout.decode('utf-8').strip()

193
webmention.py Normal file
View file

@ -0,0 +1,193 @@
import asyncio
import uvloop
import os
import hashlib
import json
import urllib.parse
import frontmatter
from sanic import Sanic
import sanic.response
from sanic.log import log as logging
import validators
import arrow
from webmentiontools import urlinfo
import shared
import envelope
class WebmentionHandler(object):
    """Process one incoming webmention.

    ``run()`` validates the two URLs, fetches and checks the source, stores
    the mention as a frontmatter file in the comments directory and sends a
    notification mail. The HTTP answer for the sender is always available
    as ``self.r``; it starts out as a 500 and is replaced by whichever step
    decides the outcome.

    Args:
        source: URL of the page that mentions the target.
        target: URL on this site that is being mentioned.
    """

    def __init__(self, source, target):
        self.source = source
        self.target = target
        self.now = arrow.utcnow().timestamp
        logging.info("incoming webmention %s => %s", self.source, self.target)
        self.r = sanic.response.text(
            "something went wrong on my side, could you please let me know at hello@petermolnar.eu ?",
            status=500
        )

    def run(self):
        """Run the whole pipeline, stopping at the first failing step."""
        if not self._validate():
            return
        # a failed parse has already set the error response; saving would
        # overwrite it with "accepted"
        if not self._parse():
            return
        self._save()
        self._notify()

    def _validate(self):
        """Check both URLs; return True when the mention may be processed.

        On failure ``self.r`` is set to a 400 response describing the
        problem and False is returned.
        """
        # a sequence, not a dict keyed by URL: identical source and target
        # must still be checked one by one
        checks = (
            (self.source, '"source" parameter is an invalid URL'),
            (self.target, '"target" parameter is an invalid URL'),
        )
        for url, emsg in checks:
            logging.debug("validating URL %s", url)
            # a missing form field arrives as None
            if not url or not validators.url(url):
                self.r = sanic.response.text(
                    emsg,
                    status=400
                )
                return False

        logging.debug("checking target domain")
        _target_domain = urllib.parse.urlparse(self.target).netloc
        _mydomains = shared.config.get('site', 'domains').split(" ")
        if _target_domain not in _mydomains:
            self.r = sanic.response.text(
                "'target' is not in the list of allowed domains",
                status=400
            )
            return False

        logging.debug("checking selfpings")
        _source_domain = urllib.parse.urlparse(self.source).netloc
        if _source_domain in _mydomains:
            self.r = sanic.response.text(
                "selfpings are not allowed",
                status=400
            )
            return False

        return True

    def _parse(self):
        """Fetch source and target; return True when both check out.

        The source must be retrievable and must link to the target, and
        the target must be retrievable. Both URLs are replaced by the
        ``realurl`` reported for them. On failure ``self.r`` is set to the
        error response and False is returned.
        """
        logging.debug("fetching %s", self.source)
        self._source = urlinfo.UrlInfo(self.source)
        if self._source.error:
            self.r = sanic.response.text(
                "couldn't fetch 'source' from %s" % (self.source),
                status=408
            )
            return False

        self.source = self._source.realurl
        if not self._source.linksTo(self.target):
            self.r = sanic.response.text(
                "'source' (%s) does not link to 'target' (%s)" % (
                    self.source,
                    self.target
                ),
                status=400
            )
            return False

        logging.debug("fetching %s", self.target)
        self._target = urlinfo.UrlInfo(self.target)
        if self._target.error:
            self.r = sanic.response.text(
                "couldn't fetch 'target' from %s" % (self.target),
                status=408
            )
            return False

        self.target = self._target.realurl
        return True

    def _save(self):
        """Write the mention to the comments directory and accept it."""
        doc = frontmatter.loads('')
        doc.metadata = self.meta
        doc.content = self.content
        target = os.path.join(
            shared.config.get('source', 'commentsdir'),
            self.mhash
        )
        if os.path.isfile(target):
            logging.warning('updating existing webmention %s', target)
        else:
            logging.warning('saving incoming webmention to %s', target)

        # the content comes from arbitrary remote pages; do not depend on
        # the locale's default encoding
        with open(target, 'wt', encoding='utf-8') as t:
            t.write(frontmatter.dumps(doc))

        self.r = sanic.response.text(
            "accepted",
            status=202
        )

    def _notify(self):
        """Mail a summary of the mention to the configured recipient."""
        meta = self.meta
        # presumably a dict with optional name/url/email keys - verify
        # against webmentiontools' UrlInfo.author()
        author = meta.get('author') or {}
        text = "# webmention\n## Source\n\nauthor\n: %s\n\nURL\n: %s\n\nemail\n: %s\n\ndate\n: %s\n\n## Target\n\nURL\n: %s\n\n---\n\n%s" % (
            author.get('name', self.source),
            author.get('url', self.source),
            author.get('email', ''),
            meta['date'],
            self.target,
            self.content
        )

        letter = envelope.Letter(
            sender=(
                shared.config.get('webmention', 'from_name'),
                shared.config.get('webmention', 'from_address')
            ),
            recipient=(
                shared.config.get('webmention', 'to_name'),
                shared.config.get('webmention', 'to_address')
            ),
            subject="[webmention] %s" % self.source,
            text=text
        )
        letter.make()
        letter.send()

    @property
    def mhash(self):
        """SHA-1 hex digest of the metadata; used as the file name."""
        return hashlib.sha1(
            json.dumps(self.meta, sort_keys=True).encode('utf-8')
        ).hexdigest()

    @property
    def meta(self):
        """Metadata of the mention, built on first access and cached."""
        if hasattr(self, '_meta'):
            return self._meta

        self._meta = {
            'author': self._source.author(),
            'type': self._source.relationType(),
            'target': self.target,
            'source': self.source,
            'date': arrow.get(self._source.pubDate()).format(shared.ARROWISO),
        }
        return self._meta

    @property
    def content(self):
        """Source content converted from HTML to Markdown, cached."""
        if hasattr(self, '_content'):
            return self._content

        # from HTML to Markdown
        self._content = shared.Pandoc(False).convert(self._source.content())
        return self._content
if __name__ == '__main__':
    # run the webmention receiver on a uvloop-backed event loop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
    app = Sanic()

    @app.route("/webmention", methods=["POST"])
    async def wm(request):
        """Feed the posted `source` and `target` to a WebmentionHandler
        and answer with whatever response it ended up with."""
        handler = WebmentionHandler(
            request.form.get('source'),
            request.form.get('target')
        )
        handler.run()
        return handler.r

    app.run(host="127.0.0.1", port=8002, debug=True)