working search and webmentions receiver

This commit is contained in:
Peter Molnar 2017-05-26 10:14:24 +01:00
parent 1b7b354a88
commit 558195288d
7 changed files with 752 additions and 123 deletions

193
envelope.py Normal file
View file

@ -0,0 +1,193 @@
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from email.header import Header
import email.charset
from email.generator import Generator
from io import StringIO
import mimetypes
from email.mime.base import MIMEBase
from email.encoders import encode_base64
import email.utils
import time
import getpass
import socket
import shutil
import requests
import tempfile
import atexit
import os
import re
import smtplib
import logging
from shared import Pandoc
class Letter(object):
    """A multipart e-mail assembled from Markdown text.

    ``make()`` builds a ``multipart/alternative`` message holding the raw
    text as ``text/plain`` and a ``multipart/related`` part with the
    Pandoc-rendered HTML plus any images referenced from the Markdown.
    ``send()`` hands the result to the SMTP server on 127.0.0.1:25.

    :param sender: ``(name, address)`` tuple; defaults to the current user
        name and the host name.
    :param recipient: ``(name, address)`` tuple; defaults to ``sender``.
    :param subject: subject line of the mail.
    :param text: Markdown body of the mail.
    """

    # Markdown image reference: ![alt](url "optional title"){optional attrs}.
    # Only the URL is captured, so the optional title never ends up in the
    # address that is downloaded; the alt text may not contain ']' so that
    # several images on one line are all found.
    _MD_IMAGE = re.compile(
        r'!\[[^\]]*\]'
        r'\(\s*([^\s)]+?\.(?:jpe?g|png|gif))'
        r'(?:\s+[\'"]?[^)]*?[\'"]?)?\s*\)'
        r'(?:\{.*?\})?'
    )

    def __init__(self, sender=None, recipient=None, subject='', text=''):
        self.sender = sender or (getpass.getuser(), socket.gethostname())
        self.recipient = recipient or self.sender
        # scratch directory for downloaded images
        self.tmp = tempfile.mkdtemp(
            prefix='envelope_',
            dir=tempfile.gettempdir()
        )
        # ignore_errors: the directory may already be gone at exit time
        atexit.register(
            shutil.rmtree,
            os.path.abspath(self.tmp),
            ignore_errors=True
        )
        self.text = text
        self.subject = subject
        self.images = []
        self.ready = None
        self.time = time.time()
        self.headers = {}

    @property
    def _html(self):
        """The Markdown text converted by Pandoc."""
        return Pandoc().convert(self.text)

    @property
    def _tmpl(self):
        """The converted text wrapped in a minimal HTML document."""
        return "<html><head></head><body>%s</body></html>" % (self._html)

    def __pull_image(self, img):
        """Download one image into the scratch directory and register it.

        Images whose file name is already registered are skipped, because
        they would share the temporary file and the Content-ID.
        """
        fname = os.path.basename(img)
        if not isinstance(self.images, list):
            self.images = []
        if any(known['name'] == fname for known in self.images):
            logging.debug("image %s is already pulled", fname)
            return

        i = {
            'url': img,
            'name': fname,
            'tmp': os.path.join(self.tmp, fname),
        }

        logging.debug("pulling image %s", i['url'])
        try:
            r = requests.get(i['url'], stream=True, timeout=30)
        except requests.RequestException as e:
            # a missing inline image should not prevent the mail itself
            logging.warning("could not pull image %s: %s", i['url'], e)
            return

        try:
            if r.status_code != 200:
                logging.warning(
                    "could not pull image %s: HTTP %s",
                    i['url'],
                    r.status_code
                )
                return
            with open(i['tmp'], 'wb') as f:
                logging.debug("writing image %s", i['tmp'])
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        finally:
            r.close()

        self.images.append(i)

    def __pull_images(self):
        """Download every image referenced from the Markdown text."""
        for img in self._MD_IMAGE.findall(self.text):
            self.__pull_image(img)

    def __attach_images(self):
        """Pull the images and point their references to ``cid:`` URLs."""
        self.__pull_images()
        for i in self.images:
            cid = 'cid:%s' % (i['name'])
            logging.debug("replacing %s with %s", i['url'], cid)
            self.text = self.text.replace(i['url'], cid)

    def make(self, inline_images=True):
        """Assemble the message and store it, UTF-8 encoded, in ``self.ready``.

        :param inline_images: when true, images referenced from the text are
            downloaded and attached inline.
        """
        if inline_images:
            self.__attach_images()

        # Python, by default, encodes utf-8 in base64, which makes plain text
        # mail painful; this overrides and forces Quoted Printable.
        # Quoted Printable is still awful, but better, and we're going to
        # force the mail to be 8bit encoded.
        # Note: enforcing 8bit breaks compatibility with ancient mail clients.
        email.charset.add_charset(
            'utf-8',
            email.charset.QP,
            email.charset.QP,
            'utf-8'
        )

        mail = MIMEMultipart('alternative')

        # --- setting headers ---
        self.headers = {
            'Subject': Header(
                re.sub(r"\r?\n?$", "", self.subject, 1),
                'utf-8'
            ).encode(),
            'To': email.utils.formataddr(self.recipient),
            'From': email.utils.formataddr(self.sender),
            'Date': email.utils.formatdate(self.time, localtime=True)
        }

        for k, v in self.headers.items():
            mail.add_header(k, "%s" % v)
        logging.debug("headers: %s", self.headers)

        # --- adding plain text ---
        _text = MIMEText(self.text, 'plain', _charset='utf-8')
        # this is the part where we overwrite the way Python thinks:
        # force the text to be the actual, unencoded, utf-8.
        # Note: these steps break compatibility with ancient mail clients.
        _text.replace_header('Content-Transfer-Encoding', '8bit')
        _text.replace_header('Content-Type', 'text/plain; charset=utf-8')
        _text.set_payload(self.text)
        logging.debug("text: %s", _text)
        mail.attach(_text)

        # --- HTML bit ---
        # this is where it gets tricky: the HTML part should be a 'related'
        # wrapper, in which the text and all the related images are sitting
        _envelope = MIMEMultipart('related')

        html = self._tmpl
        _html = MIMEText(html, 'html', _charset='utf-8')
        # see above under 'adding plain text'
        _html.replace_header('Content-Transfer-Encoding', '8bit')
        _html.replace_header('Content-Type', 'text/html; charset=utf-8')
        _html.set_payload(html)
        logging.debug("HTML: %s", _html)
        _envelope.attach(_html)

        for i in self.images:
            mimetype, encoding = mimetypes.guess_type(i['tmp'])
            mimetype = mimetype or 'application/octet-stream'
            maintype, subtype = mimetype.split('/', 1)
            attachment = MIMEBase(maintype, subtype)
            with open(i['tmp'], 'rb') as img:
                attachment.set_payload(img.read())
            # the payload is in memory now, the file is not needed any more
            os.unlink(i['tmp'])

            encode_base64(attachment)
            attachment.add_header(
                'Content-Disposition',
                'inline',
                filename=i['name']
            )
            attachment.add_header(
                'Content-ID',
                '<%s>' % (i['name'])
            )
            _envelope.attach(attachment)

        # add the whole html + image pack to the mail
        mail.attach(_envelope)

        str_io = StringIO()
        g = Generator(str_io, False)
        g.flatten(mail)

        self.ready = str_io.getvalue().encode('utf-8')

    def send(self):
        """Send the prepared message through the SMTP server on localhost.

        Does nothing but log an error when ``make()`` has not been called.
        Delivery errors are logged, never raised.
        """
        if not self.ready:
            logging.error('this mail is not ready')
            return

        try:
            # the context manager closes the connection even on failure
            with smtplib.SMTP('127.0.0.1', 25) as s:
                # self.ready is already encoded to bytes; passing a str
                # would make smtplib encode it as ASCII and fail with
                # UnicodeEncodeError on any non-ASCII character
                s.sendmail(
                    self.headers['From'],
                    self.headers['To'],
                    self.ready
                )
        except Exception as e:
            # deliberate catch-all: sending is best effort for the callers
            logging.error('sending mail failed with error: %s', e)

311
nasg.py Normal file → Executable file
View file

@ -8,15 +8,15 @@ import shutil
import logging import logging
import json import json
import glob import glob
import subprocess
import tempfile import tempfile
import atexit import atexit
import re import re
import hashlib import hashlib
import math import math
import asyncio import asyncio
import magic import csv
import magic
import arrow import arrow
import wand.image import wand.image
import similar_text import similar_text
@ -27,7 +27,7 @@ import requests
from breadability.readable import Article from breadability.readable import Article
from whoosh import index from whoosh import index
import jinja2 import jinja2
import urllib.parse
import shared import shared
def splitpath(path): def splitpath(path):
@ -70,13 +70,19 @@ class Indexer(object):
for url, offlinecopy in singular.offlinecopies.items(): for url, offlinecopy in singular.offlinecopies.items():
content_remote.append("%s" % offlinecopy) content_remote.append("%s" % offlinecopy)
weight = 1
if singular.isbookmark:
weight = 10
if singular.ispage:
weight = 100
self.writer.add_document( self.writer.add_document(
title=singular.title, title=singular.title,
url=singular.url, url=singular.url,
content=" ".join(list(map(str,[*content_real, *content_remote]))), content=" ".join(list(map(str,[*content_real, *content_remote]))),
date=singular.published.datetime, date=singular.published.datetime,
tags=",".join(list(map(str, singular.tags))), tags=",".join(list(map(str, singular.tags))),
weight=1, weight=weight,
img="%s" % singular.photo img="%s" % singular.photo
) )
@ -190,35 +196,6 @@ class Renderer(object):
return True return True
return False return False
#def rendersingular(self, singular):
#logging.debug("rendering and saving %s", singular.fname)
#targetdir = os.path.abspath(os.path.join(
#shared.config.get('target', 'builddir'),
#singular.fname
#))
#target = os.path.join(targetdir, 'index.html')
#if not shared.config.get('params', 'force') and os.path.isfile(target):
#ttime = int(os.path.getmtime(target))
#if ttime == singular.mtime:
#logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime)
#return
#if not os.path.isdir(targetdir):
#os.mkdir(targetdir)
#tmpl = self.j2.get_template(singular.tmplfile)
#tmplvars = {
#'post': singular.tmplvars,
#'site': self.sitevars,
#'taxonomy': {},
#}
#r = tmpl.render(tmplvars)
#with open(target, "w") as html:
#html.write(r)
#html.close()
#os.utime(target, (singular.mtime, singular.mtime))
class BaseIter(object): class BaseIter(object):
def __init__(self): def __init__(self):
@ -248,97 +225,97 @@ class BaseIter(object):
yield (k, v) yield (k, v)
return return
class CMDLine(object): #class CMDLine(object):
def __init__(self, executable): #def __init__(self, executable):
self.executable = self._which(executable) #self.executable = self._which(executable)
if self.executable is None: #if self.executable is None:
raise OSError('No %s found in PATH!' % executable) #raise OSError('No %s found in PATH!' % executable)
return #return
@staticmethod #@staticmethod
def _which(name): #def _which(name):
for d in os.environ['PATH'].split(':'): #for d in os.environ['PATH'].split(':'):
which = glob.glob(os.path.join(d, name), recursive=True) #which = glob.glob(os.path.join(d, name), recursive=True)
if which: #if which:
return which.pop() #return which.pop()
return None #return None
def __enter__(self): #def __enter__(self):
self.process = subprocess.Popen( #self.process = subprocess.Popen(
[self.executable, "-stay_open", "True", "-@", "-"], #[self.executable, "-stay_open", "True", "-@", "-"],
universal_newlines=True, #universal_newlines=True,
stdin=subprocess.PIPE, stdout=subprocess.PIPE) #stdin=subprocess.PIPE, stdout=subprocess.PIPE)
return self #return self
def __exit__(self, exc_type, exc_value, traceback): #def __exit__(self, exc_type, exc_value, traceback):
self.process.stdin.write("-stay_open\nFalse\n") #self.process.stdin.write("-stay_open\nFalse\n")
self.process.stdin.flush() #self.process.stdin.flush()
def execute(self, *args): #def execute(self, *args):
args = args + ("-execute\n",) #args = args + ("-execute\n",)
self.process.stdin.write(str.join("\n", args)) #self.process.stdin.write(str.join("\n", args))
self.process.stdin.flush() #self.process.stdin.flush()
output = "" #output = ""
fd = self.process.stdout.fileno() #fd = self.process.stdout.fileno()
while not output.endswith(self.sentinel): #while not output.endswith(self.sentinel):
output += os.read(fd, 4096).decode('utf-8', errors='ignore') #output += os.read(fd, 4096).decode('utf-8', errors='ignore')
return output[:-len(self.sentinel)] #return output[:-len(self.sentinel)]
class Pandoc(CMDLine): #class Pandoc(CMDLine):
""" Handles calling external binary `exiftool` in an efficient way """ #""" Handles calling external binary `exiftool` in an efficient way """
def __init__(self, md2html=True): #def __init__(self, md2html=True):
super().__init__('pandoc') #super().__init__('pandoc')
if md2html: #if md2html:
self.i = "markdown+" + "+".join([ #self.i = "markdown+" + "+".join([
'backtick_code_blocks', #'backtick_code_blocks',
'auto_identifiers', #'auto_identifiers',
'fenced_code_attributes', #'fenced_code_attributes',
'definition_lists', #'definition_lists',
'grid_tables', #'grid_tables',
'pipe_tables', #'pipe_tables',
'strikeout', #'strikeout',
'superscript', #'superscript',
'subscript', #'subscript',
'markdown_in_html_blocks', #'markdown_in_html_blocks',
'shortcut_reference_links', #'shortcut_reference_links',
'autolink_bare_uris', #'autolink_bare_uris',
'raw_html', #'raw_html',
'link_attributes', #'link_attributes',
'header_attributes', #'header_attributes',
'footnotes', #'footnotes',
]) #])
self.o = 'html5' #self.o = 'html5'
else: #else:
self.o = "markdown-" + "-".join([ #self.o = "markdown-" + "-".join([
'raw_html', #'raw_html',
'native_divs', #'native_divs',
'native_spans', #'native_spans',
]) #])
self.i = 'html' #self.i = 'html'
def convert(self, text): #def convert(self, text):
cmd = ( #cmd = (
self.executable, #self.executable,
'-o-', #'-o-',
'--from=%s' % self.i, #'--from=%s' % self.i,
'--to=%s' % self.o #'--to=%s' % self.o
) #)
logging.debug('converting content with Pandoc') #logging.debug('converting content with Pandoc')
p = subprocess.Popen( #p = subprocess.Popen(
cmd, #cmd,
stdin=subprocess.PIPE, #stdin=subprocess.PIPE,
stdout=subprocess.PIPE, #stdout=subprocess.PIPE,
stderr=subprocess.PIPE, #stderr=subprocess.PIPE,
) #)
stdout, stderr = p.communicate(input=text.encode()) #stdout, stderr = p.communicate(input=text.encode())
if stderr: #if stderr:
logging.error("Error during pandoc covert:\n\t%s\n\t%s", cmd, stderr) #logging.error("Error during pandoc covert:\n\t%s\n\t%s", cmd, stderr)
return stdout.decode('utf-8').strip() #return stdout.decode('utf-8').strip()
# based on http://stackoverflow.com/a/10075210 # based on http://stackoverflow.com/a/10075210
class ExifTool(CMDLine): class ExifTool(shared.CMDLine):
""" Handles calling external binary `exiftool` in an efficient way """ """ Handles calling external binary `exiftool` in an efficient way """
sentinel = "{ready}\n" sentinel = "{ready}\n"
@ -419,6 +396,7 @@ class WebImage(object):
self.alttext = '' self.alttext = ''
self.sizes = [] self.sizes = []
self.fallbacksize = int(shared.config.get('common','fallbackimg', fallback='720')) self.fallbacksize = int(shared.config.get('common','fallbackimg', fallback='720'))
self.cl = None
for size in shared.config.options('downsize'): for size in shared.config.options('downsize'):
sizeext = shared.config.get('downsize', size) sizeext = shared.config.get('downsize', size)
@ -453,7 +431,7 @@ class WebImage(object):
) )
def __str__(self): def __str__(self):
if self.is_downsizeable: if self.is_downsizeable and not self.cl:
return '\n<figure class="photo"><a target="_blank" class="adaptive" href="%s"><img src="%s" class="adaptimg" alt="%s" /></a><figcaption class=\"caption\">%s%s</figcaption></figure>\n' % ( return '\n<figure class="photo"><a target="_blank" class="adaptive" href="%s"><img src="%s" class="adaptimg" alt="%s" /></a><figcaption class=\"caption\">%s%s</figcaption></figure>\n' % (
self.target, self.target,
self.fallback, self.fallback,
@ -461,8 +439,18 @@ class WebImage(object):
self.fname, self.fname,
self.ext self.ext
) )
elif self.cl:
self.cl = self.cl.replace('.', ' ')
return '<img src="%s" class="%s" alt="%s" title="%s%s" />' % (
self.fallback,
self.cl,
self.alttext,
self.fname,
self.ext
)
else: else:
return '\n<figure class="picture"><img src="%s" class="aligncenter" alt="%s" /><figcaption class=\"caption\">%s%s</figcaption></figure>\n' % ( return '<img src="%s" class="aligncenter" alt="%s" title="%s%s" />' % (
self.fallback, self.fallback,
self.alttext, self.alttext,
self.fname, self.fname,
@ -768,10 +756,15 @@ class Content(BaseIter):
self.front = Taxonomy() self.front = Taxonomy()
def populate(self): def populate(self):
now = arrow.utcnow().timestamp
for fpath in self.files: for fpath in self.files:
item = Singular(fpath, self.images) item = Singular(fpath, self.images)
self.append(item.pubtime, item) self.append(item.pubtime, item)
if item.pubtime > now:
logging.warning("skipping future post %s", item.fname)
continue
if item.isonfront: if item.isonfront:
self.front.append(item.pubtime, item) self.front.append(item.pubtime, item)
@ -804,7 +797,7 @@ class Content(BaseIter):
'sitemap.txt' 'sitemap.txt'
) )
urls = [] urls = []
for t, item in self.data.items(): for item in self.data.values():
urls.append( "%s/%s/" % ( urls.append( "%s/%s/" % (
shared.config.get('site', 'url'), shared.config.get('site', 'url'),
item.fname item.fname
@ -814,6 +807,47 @@ class Content(BaseIter):
logging.info("writing sitemap to %s" % (target)) logging.info("writing sitemap to %s" % (target))
f.write("\n".join(urls)) f.write("\n".join(urls))
def magicphp(self, renderer):
redirects = []
gones = []
rfile = os.path.join(
shared.config.get('common', 'basedir'),
shared.config.get('common', 'redirects')
)
if os.path.isfile(rfile):
with open(rfile, newline='') as csvfile:
r = csv.reader(csvfile, delimiter=' ')
for row in r:
redirects.append((row[0], row[1]))
for item in self.data.values():
redirects.append((item.shortslug, item.fname))
rfile = os.path.join(
shared.config.get('common', 'basedir'),
shared.config.get('common', 'gone')
)
if os.path.isfile(rfile):
with open(rfile, newline='') as csvfile:
r = csv.reader(csvfile, delimiter=' ')
for row in r:
gones.append(row[0])
tmplvars = {
'redirects': redirects,
'gones': gones
}
r = renderer.j2.get_template("magic.php").render(tmplvars)
target = os.path.abspath(os.path.join(
shared.config.get('target', 'builddir'),
'magic.php'
))
with open(target, "w") as html:
logging.debug('writing %s', target)
html.write(r)
html.close()
class Singular(object): class Singular(object):
def __init__(self, path, images): def __init__(self, path, images):
logging.debug("initiating singular object from %s", path) logging.debug("initiating singular object from %s", path)
@ -874,6 +908,9 @@ class Singular(object):
logging.debug("%s not found in images", fname) logging.debug("%s not found in images", fname)
continue continue
if cl:
image.cl = cl
logging.debug( logging.debug(
"replacing %s in content with %s", "replacing %s in content with %s",
shortcode, shortcode,
@ -904,6 +941,24 @@ class Singular(object):
return reactions return reactions
@property
def urls(self):
urls = shared.URLREGEX.findall(self.content)
for reactionurls in self.reactions.values():
urls = [*urls, *reactionurls]
r = []
for link in urls:
domain = '{uri.netloc}'.format(uri=urllib.parse.urlparse(link))
if domain in shared.config.get('site', 'domains'):
continue
if r.get(link, False):
continue
r.append(link)
return r
@property @property
def lang(self): def lang(self):
lang = 'en' lang = 'en'
@ -976,7 +1031,7 @@ class Singular(object):
maybe = self.meta.get(maybe, False) maybe = self.meta.get(maybe, False)
if maybe: if maybe:
return maybe return maybe
return self.fname return ''
@property @property
def url(self): def url(self):
@ -1091,6 +1146,7 @@ class Singular(object):
'slug': self.fname, 'slug': self.fname,
'shortslug': self.shortslug, 'shortslug': self.shortslug,
'rssenclosure': self.rssenclosure, 'rssenclosure': self.rssenclosure,
'copies': self.offlinecopies,
} }
@property @property
@ -1143,6 +1199,12 @@ class NASG(object):
def __init__(self): def __init__(self):
# --- set params # --- set params
parser = argparse.ArgumentParser(description='Parameters for NASG') parser = argparse.ArgumentParser(description='Parameters for NASG')
parser.add_argument(
'--clear',
action='store_true',
default=False,
help='clear build directory in advance'
)
parser.add_argument( parser.add_argument(
'--regenerate', '--regenerate',
action='store_true', action='store_true',
@ -1217,6 +1279,13 @@ class NASG(object):
await searchdb.append(singular) await searchdb.append(singular)
def run(self): def run(self):
if shared.config.getboolean('params', 'clear'):
input('about to clear build directory, press enter to continue')
shutil.rmtree(os.path.abspath(
shared.config.get('target', 'builddir')
))
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
for d in shared.config.options('target'): for d in shared.config.options('target'):
@ -1235,8 +1304,8 @@ class NASG(object):
content = Content(images) content = Content(images)
content.populate() content.populate()
if not shared.config.getboolean('params', 'norender'):
renderer = Renderer() renderer = Renderer()
if not shared.config.getboolean('params', 'norender'):
logging.info("rendering content") logging.info("rendering content")
loop.run_until_complete(self.__acrender(content, renderer)) loop.run_until_complete(self.__acrender(content, renderer))
@ -1249,6 +1318,9 @@ class NASG(object):
logging.info("rendering sitemap") logging.info("rendering sitemap")
content.sitemap() content.sitemap()
logging.info("render magic.php")
content.magicphp(renderer)
logging.info("copy the static bits") logging.info("copy the static bits")
src = shared.config.get('source', 'staticdir') src = shared.config.get('source', 'staticdir')
for item in os.listdir(src): for item in os.listdir(src):
@ -1264,7 +1336,6 @@ class NASG(object):
loop.close() loop.close()
if __name__ == '__main__': if __name__ == '__main__':
worker = NASG() worker = NASG()
worker.run() worker.run()

4
new.py
View file

@ -36,7 +36,7 @@ if __name__ == '__main__':
now = arrow.utcnow() now = arrow.utcnow()
parser = argparse.ArgumentParser(description='create doc and print it to stdout') parser = argparse.ArgumentParser(description='create doc and print it to stdout')
parser.add_argument('--tags', '-t', help='; separated, quoted list of tags') parser.add_argument('--tags', '-t', help='; separated, quoted list of tags')
parser.add_argument('--date', '-d', help=' YYYY-mm-ddTHH:MM:SS+TZTZ formatted date, if not now') parser.add_argument('--date', '-d', help=' YYYY-mm-ddTHH:MM:SS+TZ formatted date, if not now')
parser.add_argument('--slug', '-s', help='slug (normally autogenerated from title or pubdate)') parser.add_argument('--slug', '-s', help='slug (normally autogenerated from title or pubdate)')
parser.add_argument('--title', '-l', help='title of new entry') parser.add_argument('--title', '-l', help='title of new entry')
parser.add_argument('--bookmark', '-b', help='URL to bookmark') parser.add_argument('--bookmark', '-b', help='URL to bookmark')
@ -48,7 +48,7 @@ if __name__ == '__main__':
args = vars(parser.parse_args()) args = vars(parser.parse_args())
if not args['date']: if not args['date']:
d = now.format("YYYY-MM-DDTHH:mm:ssZ") d = now.format(shared.ARROWISO)
args['date'] = input('Date [%s]: ' % (d)) or d args['date'] = input('Date [%s]: ' % (d)) or d
if not args['title']: if not args['title']:

View file

@ -3,6 +3,7 @@ appdirs==1.4.3
arrow==0.10.0 arrow==0.10.0
breadability==0.1.20 breadability==0.1.20
chardet==3.0.3 chardet==3.0.3
decorator==4.0.11
docopt==0.6.2 docopt==0.6.2
httptools==0.0.9 httptools==0.0.9
Jinja2==2.9.6 Jinja2==2.9.6
@ -23,6 +24,7 @@ ujson==1.35
unicode-slugify==0.1.3 unicode-slugify==0.1.3
Unidecode==0.4.20 Unidecode==0.4.20
uvloop==0.8.0 uvloop==0.8.0
validators==0.11.3
Wand==0.4.4 Wand==0.4.4
websockets==3.3 websockets==3.3
Whoosh==2.7.4 Whoosh==2.7.4

77
search.py Normal file
View file

@ -0,0 +1,77 @@
#!/usr/bin/env python3
import asyncio
import uvloop
import os
from sanic import Sanic
import sanic.response
from sanic.log import log as logging
from whoosh import index
from whoosh import qparser
from whoosh import fields
from whoosh import analysis
import jinja2
import shared
def SearchHandler(query, tmpl):
    """Run a full text search and render the hits with a template.

    :param query: the search string; ``+`` is translated to ``AND`` and
        `` -`` to ``NOT`` before it is parsed.
    :param tmpl: template object whose ``render()`` receives the variables
        ``term`` and ``posts``.
    :return: an HTML response with status 200, or a plain text response
        with status 400 when ``query`` is empty.
    """
    if not query:
        return sanic.response.text(
            "You seem to have forgot to enter what you want to search for. Please try again.",
            status=400
        )

    query = query.replace('+', ' AND ').replace(' -', ' NOT ')
    ix = index.open_dir(os.path.abspath(os.path.join(
        shared.config.get('target', 'builddir'),
        shared.config.get('var', 'searchdb')
    )))

    qp = qparser.MultifieldParser(
        ["title", "content", "tags"],
        schema=shared.schema
    )
    q = qp.parse(query)

    results = []
    # hits are only usable while the searcher is open, so everything needed
    # for rendering is copied out inside the with block, which also closes
    # the searcher afterwards
    with ix.searcher() as searcher:
        r = searcher.search(q, sortedby="weight", limit=100)
        logging.info("results for '%s': %i", query, len(r))
        for result in r:
            res = {
                'title': result['title'],
                'url': result['url'],
                'highlight': result.highlights("content"),
            }
            if 'img' in result:
                res['img'] = result['img']
            results.append(res)

    tvars = {
        'term': query,
        'posts': results,
    }

    logging.info("collected %i results to render", len(results))
    return sanic.response.html(tmpl.render(tvars), status=200)
if __name__ == '__main__':
    # serve the search endpoint on localhost with the uvloop event loop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
    app = Sanic()

    jldr = jinja2.FileSystemLoader(
        searchpath=shared.config.get('source', 'templatesdir')
    )
    jenv = jinja2.Environment(loader=jldr)
    # the template is loaded once and shared by every request
    tmpl = jenv.get_template('searchresults.html')

    # the allowed methods belong to the route, not to the handler signature
    @app.route("/search", methods=["GET"])
    async def search(request):
        """Answer /search?s=<query> with the rendered result page."""
        query = request.args.get('s')
        return SearchHandler(query, tmpl)

    app.run(host="127.0.0.1", port=8001, debug=True)

View file

@ -1,8 +1,11 @@
import configparser import configparser
import os import os
import re
import glob
import logging
import subprocess
from whoosh import fields from whoosh import fields
from whoosh import analysis from whoosh import analysis
import re
def __expandconfig(config): def __expandconfig(config):
""" add the dirs to the config automatically """ """ add the dirs to the config automatically """
@ -18,6 +21,8 @@ def __expandconfig(config):
)) ))
return config return config
ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
URLREGEX = re.compile( URLREGEX = re.compile(
r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+' r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*' r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
@ -74,3 +79,91 @@ config = configparser.ConfigParser(
) )
config.read('config.ini') config.read('config.ini')
config = __expandconfig(config) config = __expandconfig(config)
class CMDLine(object):
    """Base class for wrappers around an external command line program.

    Used as a context manager it starts the program with
    ``-stay_open True -@ -`` and talks to it through pipes; ``execute()``
    sends arguments and reads the answer up to ``self.sentinel``.
    ``sentinel`` is not defined here: a subclass using ``execute()`` has to
    provide it.

    :param executable: name of the program to look up in PATH.
    :raises OSError: when the program is not found in PATH.
    """

    def __init__(self, executable):
        self.executable = self._which(executable)
        if self.executable is None:
            raise OSError('No %s found in PATH!' % executable)

    @staticmethod
    def _which(name):
        """Return the full path of executable ``name`` or None."""
        # local import: keeps the block independent of the module imports
        import shutil
        return shutil.which(name)

    def __enter__(self):
        self.process = subprocess.Popen(
            [self.executable, "-stay_open", "True", "-@", "-"],
            universal_newlines=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Ask the program to quit, then close the pipes and reap it."""
        try:
            self.process.stdin.write("-stay_open\nFalse\n")
            self.process.stdin.flush()
            self.process.stdin.close()
        except (OSError, ValueError) as e:
            # the process is already gone or the pipe is already closed
            logging.debug("could not tell %s to quit: %s", self.executable, e)

        try:
            self.process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            logging.warning("%s did not quit, killing it", self.executable)
            self.process.kill()
            self.process.wait()
        finally:
            if self.process.stdout is not None:
                self.process.stdout.close()

    def execute(self, *args):
        """Send ``args`` to the running program and return its answer.

        :param args: arguments, sent one per line, followed by ``-execute``.
        :return: the text printed by the program, without the sentinel.
        :raises OSError: when the output ends before the sentinel is seen.
        """
        args = args + ("-execute\n",)
        self.process.stdin.write(str.join("\n", args))
        self.process.stdin.flush()

        sentinel = self.sentinel.encode('utf-8')
        fd = self.process.stdout.fileno()
        # collect raw bytes and decode once at the end: a multibyte
        # character may be split between two reads
        output = bytearray()
        while not output.endswith(sentinel):
            chunk = os.read(fd, 4096)
            if not chunk:
                # EOF: the process closed its output, waiting is pointless
                raise OSError(
                    'output of %s ended before the sentinel was received' % (
                        getattr(self, 'executable', 'the process')
                    )
                )
            output += chunk
        return output[:-len(sentinel)].decode('utf-8', errors='ignore')
class Pandoc(CMDLine):
    """Convert text between Markdown and HTML with the `pandoc` binary.

    :param md2html: when true (the default) the conversion goes from
        Markdown, with the extensions listed below, to HTML5; otherwise
        from HTML to Markdown without raw HTML, native divs and spans.
    :raises OSError: when `pandoc` is not found in PATH.
    """

    def __init__(self, md2html=True):
        super().__init__('pandoc')
        if md2html:
            self.i = "markdown+" + "+".join([
                'backtick_code_blocks',
                'auto_identifiers',
                'fenced_code_attributes',
                'definition_lists',
                'grid_tables',
                'pipe_tables',
                'strikeout',
                'superscript',
                'subscript',
                'markdown_in_html_blocks',
                'shortcut_reference_links',
                'autolink_bare_uris',
                'raw_html',
                'link_attributes',
                'header_attributes',
                'footnotes',
            ])
            self.o = 'html5'
        else:
            self.o = "markdown-" + "-".join([
                'raw_html',
                'native_divs',
                'native_spans',
            ])
            self.i = 'html'

    def convert(self, text):
        """Pipe ``text`` through pandoc and return the converted string.

        Problems reported by pandoc are logged; whatever was written to
        standard output is returned, stripped, in any case.
        """
        cmd = (
            self.executable,
            '-o-',
            '--from=%s' % self.i,
            '--to=%s' % self.o
        )
        logging.debug('converting content with Pandoc')
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        stdout, stderr = p.communicate(input=text.encode('utf-8'))
        # a failure may show up as an exit status, as stderr output, or both
        if p.returncode != 0 or stderr:
            logging.error(
                "Error during pandoc convert (exit status %s):\n\t%s\n\t%s",
                p.returncode,
                cmd,
                stderr.decode('utf-8', errors='replace')
            )
        return stdout.decode('utf-8').strip()

193
webmention.py Normal file
View file

@ -0,0 +1,193 @@
import asyncio
import uvloop
import os
import hashlib
import json
import urllib.parse
import frontmatter
from sanic import Sanic
import sanic.response
from sanic.log import log as logging
import validators
import arrow
from webmentiontools import urlinfo
import shared
import envelope
class WebmentionHandler(object):
    """Validate, fetch, store and report one incoming webmention.

    After ``run()`` the HTTP response for the sender is in ``self.r``; it
    starts out as a generic status 500 answer and is replaced by every step
    that reaches a verdict.

    :param source: URL of the page that mentions the target.
    :param target: URL of the mentioned page.
    """

    def __init__(self, source, target):
        self.source = source
        self.target = target
        self.now = arrow.utcnow().timestamp
        logging.info("incoming webmention %s => %s", self.source, self.target)

        self.r = sanic.response.text(
            "something went wrong on my side, could you please let me know at hello@petermolnar.eu ?",
            status=500
        )

    def run(self):
        """Process the webmention; stop at the first step that fails."""
        if not self._validate():
            return
        # nothing may be saved or mailed unless the source was fetched and
        # was found to link to the target
        if not self._parse():
            return
        self._save()
        self._notify()

    def _validate(self):
        """Check both URLs and the domains they belong to.

        :return: True when the webmention is acceptable; otherwise False,
            with a status 400 response stored in ``self.r``.
        """
        # a sequence, not a dict keyed by URL: source and target may be
        # identical, or both missing
        checks = (
            (self.source, '"source" parameter is an invalid URL'),
            (self.target, '"target" parameter is an invalid URL'),
        )
        for url, emsg in checks:
            logging.debug("validating URL %s", url)
            if not url or not validators.url(url):
                self.r = sanic.response.text(
                    emsg,
                    status=400
                )
                return False

        logging.debug("checking target domain")
        _target = urllib.parse.urlparse(self.target)
        _target_domain = '{uri.netloc}'.format(uri=_target)
        _mydomains = shared.config.get('site', 'domains').split(" ")
        if _target_domain not in _mydomains:
            self.r = sanic.response.text(
                "'target' is not in the list of allowed domains",
                status=400
            )
            return False

        logging.debug("checking selfpings")
        _source = urllib.parse.urlparse(self.source)
        _source_domain = '{uri.netloc}'.format(uri=_source)
        if _source_domain in _mydomains:
            self.r = sanic.response.text(
                "selfpings are not allowed",
                status=400
            )
            return False

        return True

    def _parse(self):
        """Fetch source and target and verify the link between them.

        ``self.source`` and ``self.target`` are replaced with the
        ``realurl`` reported for the fetched pages.

        :return: True on success; otherwise False, with the error response
            stored in ``self.r``.
        """
        logging.debug("fetching %s", self.source)
        self._source = urlinfo.UrlInfo(self.source)
        if self._source.error:
            self.r = sanic.response.text(
                "couldn't fetch 'source' from %s" % (self.source),
                status=408
            )
            return False

        self.source = self._source.realurl
        if not self._source.linksTo(self.target):
            self.r = sanic.response.text(
                "'source' (%s) does not link to 'target' (%s)" % (
                    self.source,
                    self.target
                ),
                status=400
            )
            return False

        logging.debug("fetching %s", self.target)
        self._target = urlinfo.UrlInfo(self.target)
        if self._target.error:
            self.r = sanic.response.text(
                "couldn't fetch 'target' from %s" % (self.target),
                status=408
            )
            return False

        self.target = self._target.realurl
        return True

    def _save(self):
        """Write the webmention into the comments directory.

        The file is named after ``mhash`` and contains ``meta`` as front
        matter followed by ``content``. Sets a status 202 response.
        """
        doc = frontmatter.loads('')
        doc.metadata = self.meta
        doc.content = self.content
        target = os.path.join(
            shared.config.get('source', 'commentsdir'),
            self.mhash
        )
        if os.path.isfile(target):
            logging.warning('updating existing webmention %s', target)
        else:
            logging.warning('saving incoming webmention to %s', target)

        with open(target, 'wt') as t:
            t.write(frontmatter.dumps(doc))

        self.r = sanic.response.text(
            "accepted",
            status=202
        )

    def _notify(self):
        """Mail a summary of the webmention to the configured recipient."""
        # go through the property, so this works even before _save();
        # NOTE(review): assumes the author is a dict or empty - confirm
        # against webmentiontools
        author = self.meta.get('author') or {}
        text = "# webmention\n## Source\n\nauthor\n: %s\n\nURL\n: %s\n\nemail\n: %s\n\ndate\n: %s\n\n## Target\n\nURL\n: %s\n\n---\n\n%s" % (
            author.get('name', self.source),
            author.get('url', self.source),
            author.get('email', ''),
            self.meta['date'],
            self.target,
            self.content
        )

        l = envelope.Letter(
            sender=(
                shared.config.get('webmention', 'from_name'),
                shared.config.get('webmention', 'from_address')
            ),
            recipient=(
                shared.config.get('webmention', 'to_name'),
                shared.config.get('webmention', 'to_address')
            ),
            subject="[webmention] %s" % self.source,
            text=text
        )
        l.make()
        l.send()

    @property
    def mhash(self):
        """SHA1 hex digest of the metadata serialised as sorted JSON."""
        return hashlib.sha1(
            json.dumps(self.meta, sort_keys=True).encode('utf-8')
        ).hexdigest()

    @property
    def meta(self):
        """Metadata of the webmention; built on first access, then cached."""
        if hasattr(self, '_meta'):
            return self._meta

        self._meta = {
            'author': self._source.author(),
            'type': self._source.relationType(),
            'target': self.target,
            'source': self.source,
            'date': arrow.get(self._source.pubDate()).format(shared.ARROWISO),
        }
        return self._meta

    @property
    def content(self):
        """Content of the source converted to Markdown; cached."""
        if hasattr(self, '_content'):
            return self._content

        # from HTML to Markdown
        self._content = shared.Pandoc(False).convert(self._source.content())
        return self._content
if __name__ == '__main__':
    # receive webmentions on localhost with the uvloop event loop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
    app = Sanic()

    @app.route("/webmention", methods=["POST"])
    async def wm(request):
        """Process the posted source/target pair and return the verdict."""
        handler = WebmentionHandler(
            request.form.get('source'),
            request.form.get('target')
        )
        handler.run()
        return handler.r

    app.run(host="127.0.0.1", port=8002, debug=True)