Back To Pandoc

So, Python Markdown is a bottomless pit of horrors, including crippling parsing bugs,
random   out of nowhere, lack of features. It's definitely much faster than
Pandoc, but Pandoc doesn't fall apart when there's a regex in a fenced code block
that happens to be a regex for markdown elements.

Also added some ugly post string replacements to make Pandoc fenced code output work
with Prism:
instead of the Pandoc <pre class="codelang"><code>, Prism wants
<pre><code class="language-codelang">, so I added a regex sub, because it's 00:32.
This commit is contained in:
Peter Molnar 2018-08-04 00:28:55 +01:00
parent 96d0c238d6
commit d3fbf2e51f
8 changed files with 181 additions and 156 deletions

View file

@ -1,57 +0,0 @@
"""
This is a simplified FencedBlockPreprocessor which outputs "proper" <code>
naming, eg. language-python, instead of just python, so prism.js understands
it.
It doesn't deal with CodeHilite.
"""
from markdown.preprocessors import Preprocessor
from markdown.extensions import Extension
from markdown.extensions.fenced_code import FencedBlockPreprocessor
class HTML5FencedBlockPreprocessor(Preprocessor):
    """
    Preprocessor that swaps every fenced code block for a stashed
    ``<pre><code class="language-LANG">`` snippet, the class naming
    prism.js understands.
    """

    # Reuse the stock fenced_code pattern instead of maintaining a copy.
    FENCED_BLOCK_RE = FencedBlockPreprocessor.FENCED_BLOCK_RE
    CODE_WRAP = '<pre><code%s>%s</code></pre>'
    LANG_TAG = ' class="language-%s"'

    def __init__(self, md):
        super(HTML5FencedBlockPreprocessor, self).__init__(md)

    @staticmethod
    def _escape(txt):
        """
        Escape the characters that are significant in HTML so the code is
        displayed literally instead of being interpreted as markup.

        :param txt: raw text of the fenced block
        :return: the text with ``&``, ``<``, ``>`` and ``"`` escaped
        """
        # '&' has to be handled first, otherwise the entities produced by
        # the following replacements would be escaped a second time.
        txt = txt.replace('&', '&amp;')
        txt = txt.replace('<', '&lt;')
        txt = txt.replace('>', '&gt;')
        txt = txt.replace('"', '&quot;')
        return txt

    def run(self, lines):
        """
        Replace all fenced code blocks found in the document.

        :param lines: the document as a list of lines
        :return: the document as a list of lines, with every fenced block
            replaced by the placeholder returned from the HTML stash
        """
        text = "\n".join(lines)
        while True:
            m = self.FENCED_BLOCK_RE.search(text)
            if not m:
                break
            lang = ''
            if m.group('lang'):
                lang = self.LANG_TAG % (m.group('lang'))
            code = self.CODE_WRAP % (
                lang,
                # the block body is arbitrary source code: escape it so
                # '<', '>' and '&' inside it do not end up as live HTML
                self._escape(m.group('code'))
            )
            placeholder = self.markdown.htmlStash.store(code)
            # Splice the placeholder in place of the matched block; the
            # search restarts on the updated text in the next iteration.
            text = '%s\n%s\n%s' % (
                text[:m.start()],
                placeholder,
                text[m.end():]
            )
        return text.split("\n")
class HTML5FencedCodeExtension(Extension):
    """ Markdown extension that installs HTML5FencedBlockPreprocessor. """

    def extendMarkdown(self, md, md_globals):
        """
        Register this extension on `md` and add the fenced block
        preprocessor under the name 'html5_fenced_code', positioned with
        the ">normalize_whitespace" location string.
        """
        md.registerExtension(self)
        preprocessor = HTML5FencedBlockPreprocessor(md)
        md.preprocessors.add(
            'html5_fenced_code', preprocessor, ">normalize_whitespace"
        )
def makeExtension(*args, **kwargs):
    """ Build an HTML5FencedCodeExtension, forwarding all arguments. """
    extension = HTML5FencedCodeExtension(*args, **kwargs)
    return extension

122
nasg.py
View file

@ -10,7 +10,7 @@ __email__ = "mail@petermolnar.net"
import glob import glob
import os import os
import time import time
from functools import lru_cache as cached from functools import partial
import re import re
import imghdr import imghdr
import logging import logging
@ -26,18 +26,15 @@ import langdetect
import wand.image import wand.image
import jinja2 import jinja2
import frontmatter import frontmatter
import markdown
from feedgen.feed import FeedGenerator from feedgen.feed import FeedGenerator
from bleach import clean from bleach import clean
from emoji import UNICODE_EMOJI from emoji import UNICODE_EMOJI
from slugify import slugify from slugify import slugify
import requests import requests
from pandoc import pandoc
import exiftool import exiftool
import settings import settings
import keys import keys
import html5_fenced_code
from pprint import pprint
MarkdownImage = namedtuple( MarkdownImage = namedtuple(
'MarkdownImage', 'MarkdownImage',
@ -56,33 +53,37 @@ RE_MDIMG = re.compile(
re.IGNORECASE re.IGNORECASE
) )
RE_HTTP = re.compile(
r'^https?://',
re.IGNORECASE
)
MD = markdown.Markdown(
output_format='xhtml5',
extensions=[
'html5_fenced_code',
'abbr',
'attr_list',
'def_list',
'footnotes',
'tables',
'smart_strong',
'headerid',
'urlize',
]
)
RE_CODE = re.compile( RE_CODE = re.compile(
r'(?:[~`]{3})(?:[^`]+)?' r'(?:[~`]{3})(?:[^`]+)?'
) )
RE_PRECODE = re.compile(
r'<pre class="([^"]+)"><code>'
)
class cached_property(object):
    """
    Descriptor that evaluates the wrapped method once per instance and
    then stores the result on the instance under the same name.
    """

    def __init__(self, method, name=None):
        # keep the wrapped function, the attribute name to cache under
        # and the function's docstring
        self.method = method
        self.name = name or method.__name__
        self.__doc__ = method.__doc__

    def __get__(self, inst, cls):
        # inst is None when the attribute is looked up on the class itself
        # (e.g. `Foo.bar`); only an instance lookup triggers the computation
        if inst is not None:
            value = self.method(inst)
            # store the value on the instance so later lookups find the
            # plain attribute and never reach this descriptor again
            setattr(inst, self.name, value)
            return value
        return self
class MarkdownDoc(object): class MarkdownDoc(object):
@property @cached_property
@cached()
def _parsed(self): def _parsed(self):
with open(self.fpath, mode='rt') as f: with open(self.fpath, mode='rt') as f:
logging.debug('parsing YAML+MD file %s', self.fpath) logging.debug('parsing YAML+MD file %s', self.fpath)
@ -97,14 +98,16 @@ class MarkdownDoc(object):
def content(self): def content(self):
return self._parsed[1] return self._parsed[1]
@property @cached_property
@cached()
def html_content(self): def html_content(self):
c = "%s" % (self.content) c = "%s" % (self.content)
if hasattr(self, 'images') and len(self.images): if hasattr(self, 'images') and len(self.images):
for match, img in self.images.items(): for match, img in self.images.items():
c = c.replace(match, str(img)) c = c.replace(match, str(img))
return MD.reset().convert(c) # return MD.reset().convert(c)
c = pandoc(c)
c = RE_PRECODE.sub('<pre><code lang="\g<1>" class="language-\g<1>">', c)
return c
class Comment(MarkdownDoc): class Comment(MarkdownDoc):
@ -188,8 +191,7 @@ class Redirect(Gone):
Redirect object for entries that moved Redirect object for entries that moved
""" """
@property @cached_property
@cached()
def target(self): def target(self):
target = '' target = ''
with open(self.fpath, 'rt') as f: with open(self.fpath, 'rt') as f:
@ -219,8 +221,7 @@ class Singular(MarkdownDoc):
ret = ctime ret = ctime
return ret return ret
@property @cached_property
@cached()
def files(self): def files(self):
""" """
An array of files present at the same directory level as An array of files present at the same directory level as
@ -233,8 +234,7 @@ class Singular(MarkdownDoc):
if not k.endswith('.md') and not k.startswith('.') if not k.endswith('.md') and not k.startswith('.')
] ]
@property @cached_property
@cached()
def comments(self): def comments(self):
""" """
An dict of Comment objects keyed with their path, populated from the An dict of Comment objects keyed with their path, populated from the
@ -251,8 +251,7 @@ class Singular(MarkdownDoc):
comments[c.dt.timestamp] = c comments[c.dt.timestamp] = c
return comments return comments
@property @cached_property
@cached()
def images(self): def images(self):
""" """
A dict of WebImage objects, populated by: A dict of WebImage objects, populated by:
@ -317,10 +316,10 @@ class Singular(MarkdownDoc):
def summary(self): def summary(self):
return self.meta.get('summary', '') return self.meta.get('summary', '')
@property @cached_property
@cached()
def html_summary(self): def html_summary(self):
return MD.reset().convert(self.summary) # return MD.reset().convert(self.summary)
return pandoc(self.summary)
@property @property
def title(self): def title(self):
@ -428,8 +427,7 @@ class Singular(MarkdownDoc):
else: else:
return False return False
@property @cached_property
@cached()
def tmplvars(self): def tmplvars(self):
v = { v = {
'title': self.title, 'title': self.title,
@ -548,8 +546,7 @@ class WebImage(object):
'is_photo': self.is_photo, 'is_photo': self.is_photo,
}) })
@property @cached_property
@cached()
def meta(self): def meta(self):
return exiftool.Exif(self.fpath) return exiftool.Exif(self.fpath)
@ -844,7 +841,7 @@ class AsyncWorker(object):
self._tasks = [] self._tasks = []
self._loop = asyncio.get_event_loop() self._loop = asyncio.get_event_loop()
def append(self, job): def add(self, job):
task = self._loop.create_task(job) task = self._loop.create_task(job)
self._tasks.append(task) self._tasks.append(task)
@ -872,7 +869,7 @@ class IndexPHP(object):
if target in self.gone: if target in self.gone:
self.add_gone(source) self.add_gone(source)
else: else:
if not RE_HTTP.match(target): if '://' not in target:
target = "%s/%s" % (settings.site.get('url'), target) target = "%s/%s" % (settings.site.get('url'), target)
self.redirect[source] = target self.redirect[source] = target
@ -1003,6 +1000,7 @@ class Category(dict):
fg = FeedGenerator() fg = FeedGenerator()
fg.id(self.feed) fg.id(self.feed)
fg.link(href=self.feed, rel='self') fg.link(href=self.feed, rel='self')
fg.link(href=settings.meta.get('hub'), rel='hub')
fg.title(self.title) fg.title(self.title)
fg.author({ fg.author({
'name': settings.author.get('name'), 'name': settings.author.get('name'),
@ -1014,6 +1012,10 @@ class Category(dict):
for post in self.get_posts(start, end): for post in self.get_posts(start, end):
dt = arrow.get(post.get('pubtime')) dt = arrow.get(post.get('pubtime'))
fe = fg.add_entry() fe = fg.add_entry()
fe.author({
'name': settings.author.get('name'),
'email':settings.author.get('email')
})
fe.id(post.get('url')) fe.id(post.get('url'))
fe.link(href=post.get('url')) fe.link(href=post.get('url'))
fe.title(post.get('title')) fe.title(post.get('title'))
@ -1021,7 +1023,7 @@ class Category(dict):
fe.updated(dt.datetime) fe.updated(dt.datetime)
fe.content( fe.content(
post.get('html_content'), post.get('html_content'),
type='CDATA' #src=post.get('url')
) )
fe.rights('%s %s %s' % ( fe.rights('%s %s %s' % (
post.get('licence').upper(), post.get('licence').upper(),
@ -1035,15 +1037,15 @@ class Category(dict):
"%d" % enc.get('size'), "%d" % enc.get('size'),
enc.get('mime') enc.get('mime')
) )
atom = os.path.join(dirname, 'index.xml') atom = os.path.join(dirname, 'index.xml')
with open(atom, 'wb') as f: with open(atom, 'wb') as f:
logging.info('writing file: %s', atom) logging.info('writing file: %s', atom)
f.write(fg.atom_str(pretty=True)) f.write(fg.atom_str(pretty=True))
jsfile = os.path.join(dirname, 'index.json')
def render_page(self, pagenum=1, pages=1): def render_page(self, pagenum=1, pages=1):
if self.display == 'flat': if self.display == 'flat':
start = 1 start = 0
end = -1 end = -1
else: else:
pagination = int(settings.site.get('pagination')) pagination = int(settings.site.get('pagination'))
@ -1201,7 +1203,7 @@ class Sitemap(dict):
def renderfile(self): def renderfile(self):
return os.path.join(settings.paths.get('build'), 'sitemap.txt') return os.path.join(settings.paths.get('build'), 'sitemap.txt')
async def save(self): async def render(self):
if self.mtime >= sorted(self.values())[-1]: if self.mtime >= sorted(self.values())[-1]:
return return
with open(self.renderfile, 'wt') as f: with open(self.renderfile, 'wt') as f:
@ -1274,8 +1276,8 @@ def make():
content = settings.paths.get('content') content = settings.paths.get('content')
worker = AsyncWorker() worker = AsyncWorker()
rules = IndexPHP() rules = IndexPHP()
for e in glob.glob(os.path.join(content, '*', '*.ptr')): for e in glob.glob(os.path.join(content, '*', '*.ptr')):
post = Gone(e) post = Gone(e)
if post.mtime > last: if post.mtime > last:
@ -1287,8 +1289,8 @@ def make():
last = post.mtime last = post.mtime
rules.add_redirect(post.source, post.target) rules.add_redirect(post.source, post.target)
if rules.mtime < last: if rules.mtime < last or settings.args.get('force'):
worker.append(rules.render()) worker.add(rules.render())
sitemap = Sitemap() sitemap = Sitemap()
search = Search() search = Search()
@ -1297,10 +1299,10 @@ def make():
for e in sorted(glob.glob(os.path.join(content, '*', '*', 'index.md'))): for e in sorted(glob.glob(os.path.join(content, '*', '*', 'index.md'))):
post = Singular(e) post = Singular(e)
worker.append(post.render()) worker.add(post.copyfiles())
worker.append(post.copyfiles())
for i in post.images.values(): for i in post.images.values():
worker.append(i.downsize()) worker.add(i.downsize())
worker.add(post.render())
if post.is_future: if post.is_future:
continue continue
else: else:
@ -1322,11 +1324,11 @@ def make():
) )
search.__exit__() search.__exit__()
worker.append(search.render()) search.render()
for category in categories.values(): for category in categories.values():
worker.append(category.render()) worker.add(category.render())
worker.append(sitemap.save()) worker.add(sitemap.render())
worker.run() worker.run()
logging.info('worker finished') logging.info('worker finished')

44
pandoc.py Normal file
View file

@ -0,0 +1,44 @@
import subprocess
import logging
def pandoc(text):
    """
    Convert Markdown to HTML5 by piping it through the pandoc command line
    tool.

    A failing conversion is logged as a warning, not raised: whatever pandoc
    wrote to stdout (possibly nothing) is still returned.

    :param text: Markdown source as str
    :return: pandoc's stdout decoded as UTF-8, with surrounding whitespace
        stripped
    """
    # TODO: cache? e.g. keyed on hashlib.md5(text.encode('utf-8')).hexdigest()
    extensions = (
        'footnotes',
        'pipe_tables',
        'raw_html',
        'definition_lists',
        'backtick_code_blocks',
        'fenced_code_attributes',
        'shortcut_reference_links',
        'lists_without_preceding_blankline',
        'autolink_bare_uris',
    )
    cmd = (
        'pandoc',
        '-o-',
        '--from=markdown+%s' % '+'.join(extensions),
        '--to=html5',
        '--quiet',
        '--no-highlight'
    )
    proc = subprocess.run(
        cmd,
        # encode explicitly so input and output agree on UTF-8
        input=text.encode('utf-8'),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # --quiet can leave stderr empty, so the exit code is checked as well;
    # otherwise a failed run would silently yield an empty string
    if proc.returncode != 0 or proc.stderr:
        logging.warning(
            "Error during pandoc convert (exit code %s):\n\t%s\n\t%s",
            proc.returncode,
            cmd,
            proc.stderr.decode('utf-8', 'replace')
        )
    return proc.stdout.decode('utf-8').strip()

View file

@ -1,26 +1,10 @@
arrow==0.12.1 arrow==0.12.1
bleach==2.1.3 bleach==2.1.3
certifi==2018.4.16
chardet==3.0.4
decorator==4.3.0
emoji==0.5.0 emoji==0.5.0
feedgen==0.7.0 feedgen==0.7.0
html5lib==1.0.1
idna==2.7
Jinja2==2.10 Jinja2==2.10
langdetect==1.0.7 langdetect==1.0.7
lxml==4.2.3
Markdown==2.6.11
markdown-urlize==0.2.0
MarkupSafe==1.0
Pygments==2.2.0
python-dateutil==2.7.3
python-frontmatter==0.4.2 python-frontmatter==0.4.2
PyYAML==3.13
requests==2.19.1 requests==2.19.1
six==1.11.0
unicode-slugify==0.1.3 unicode-slugify==0.1.3
Unidecode==1.0.22
urllib3==1.23
Wand==0.4.4 Wand==0.4.4
webencodings==0.5.1

View file

@ -6,12 +6,28 @@ $redirects = array(
{% endfor %} {% endfor %}
); );
$redirects_re = array(
'^(?:sysadmin|it|linux-tech-coding|sysadmin-blog)\/?(page.*)?$' => 'category/article/',
'^(?:fotography|photoblog)\/?(page.*)?$' => '/category/photo/$1',
'^blog\/?(page.*)?$' => '/category/journal/',
'^blips\/?(page.*)?$' => '/category/note/$1',
'^r\/?(page.*)?$' => '/category/note/$1',
'^(?:linux-tech-coding|it|sysadmin-blog|sysadmin|fotography|blips|blog|photoblog|article|journal|photo|note|r)\/((?!page).*)' => '/$1',
);
$gone = array( $gone = array(
{% for gone in gones %} {% for gone in gones %}
"{{ gone }}" => true, "{{ gone }}" => true,
{% endfor %} {% endfor %}
); );
$gone_re = array(
'^cache/.*$',
'^files/.*$',
'^wp-content/.*$',
'^broadcast\/wp-ffpc.message$',
);
function redirect_to($uri) { function redirect_to($uri) {
header('HTTP/1.1 301 Moved Permanently'); header('HTTP/1.1 301 Moved Permanently');
@ -78,11 +94,33 @@ $uri = str_replace('/feed/', '', $uri);
$uri = str_replace('/atom/', '', $uri); $uri = str_replace('/atom/', '', $uri);
$uri = trim($uri, '/'); $uri = trim($uri, '/');
if (isset($gone[$uri])) foreach ($gone_re as $pattern) {
if (preg_match(sprintf('/%s/', $pattern), $uri)) {
gone($uri);
}
}
foreach ($redirects_re as $pattern => $target) {
$maybe = preg_match(sprintf('/%s/i', $pattern), $uri, $matches);
if ($maybe) {
$target = str_replace('$1', $matches[1], $target);
redirect_to($target);
}
}
/* "logic" */
if (isset($gone[$uri])) {
gone($uri); gone($uri);
elseif (isset($redirects[$uri])) }
elseif (isset($redirects[$uri])) {
redirect_to($redirects[$uri]); redirect_to($redirects[$uri]);
elseif (strstr($uri, '_')) }
elseif (preg_match('/^\.well-known\/(host-meta|webfinger).*$/', $uri)) {
redirect_to("https://fed.brid.gy/{$uri}");
}
elseif (strstr($uri, '_')) {
maybe_redirect(str_replace('_', '-', $uri)); maybe_redirect(str_replace('_', '-', $uri));
else }
else {
notfound(); notfound();
}

View file

@ -5,3 +5,11 @@
<link rel="canonical" href="{{ post.url }}" /> <link rel="canonical" href="{{ post.url }}" />
<link rel="license" href="https://creativecommons.org/licenses/4.0/{{ post.licence }}" /> <link rel="license" href="https://creativecommons.org/licenses/4.0/{{ post.licence }}" />
{% endblock %} {% endblock %}
{% block prism %}
{% if post.has_code %}
<style media="all">
{% include 'prism.css' %}
</style>
<script src="{{ site.url }}/prism.js"></script>
{% endif %}
{% endblock %}

View file

@ -33,14 +33,11 @@
} }
localStorage.setItem("stylesheet", setto); localStorage.setItem("stylesheet", setto);
e.setAttribute("media", setto); e.setAttribute("media", setto);
return false;
} }
</script> </script>
{% if post.has_code %} {% block prism %}
<style media="all"> {% endblock %}
{% include 'prism.css' %}
</style>
<script src="{{ site.url }}/prism.js"></script>
{% endif %}
</head> </head>
<body> <body>
@ -90,7 +87,7 @@
<p class="contrast"> <p class="contrast">
<a title="toggle site colour scheme" href="#" <a title="toggle site colour scheme" href="#"
onclick="toggleStylesheet(this)"> onclick="return toggleStylesheet(this)">
<svg class="icon" width="16" height="16"> <svg class="icon" width="16" height="16">
<use xlink:href="#icon-contrast" /> <use xlink:href="#icon-contrast" />
</svg> </svg>
@ -169,9 +166,7 @@
<a class="fn p-name url u-url u-uid" href="{{ author.url }}"> <a class="fn p-name url u-url u-uid" href="{{ author.url }}">
{{ author.name }} {{ author.name }}
</a> </a>
&lt;<a rel="me" class="u-email email" href="mailto:{{ author.email }}"> &lt;<a rel="me" class="u-email email" href="mailto:{{ author.email }}">{{ author.email }}</a>&gt;
{{ author.email }}
</a>&gt;
</p> </p>
</dd> </dd>
@ -422,7 +417,7 @@
</nav> </nav>
<div class="webring"> <div class="webring">
<a href="https://xn--sr8hvo.ws/🇻🇮📢/previous"></a> <a href="https://xn--sr8hvo.ws/🇻🇮📢/previous"></a>
Member of <a href="https://xn--sr8hvo.ws">IndieWeb Webring</a> 🕸💍 Member of <a href="https://xn--sr8hvo.ws">IndieWeb Webring</a>
<a href="https://xn--sr8hvo.ws/🇻🇮📢/next"></a> <a href="https://xn--sr8hvo.ws/🇻🇮📢/next"></a>
</div> </div>
</div> </div>

View file

@ -25,7 +25,7 @@ svg {
width: 16px; width: 16px;
height: 16px; height: 16px;
fill: currentColor; fill: currentColor;
vertical-align: text-top; vertical-align: middle;
} }
a { a {
@ -41,6 +41,10 @@ h1 {
font-size: 1.6em; font-size: 1.6em;
} }
h1, h2 {
line-height: 1.2em;
}
h2, h3 { h2, h3 {
margin-top: 2em; margin-top: 2em;
} }
@ -153,6 +157,7 @@ code, pre {
pre { pre {
padding: 0.6em; padding: 0.6em;
position: relative;
} }
code { code {
@ -163,6 +168,11 @@ pre > code {
border: none; border: none;
} }
pre> code::before {
content: attr(lang);
float: right;
}
table { table {
border-collapse: collapse; border-collapse: collapse;
border-spacing: 0; border-spacing: 0;
@ -280,16 +290,11 @@ body > footer dd {
margin: -1.3em 0 0 4em; margin: -1.3em 0 0 4em;
} }
body > footer nav {
display: flex;
justify-content: space-between;
}
.webring { .webring {
text-align: center; text-align: center;
} }
.footnote a { .footnotes a {
display: inline-block; display: inline-block;
overflow: hidden; overflow: hidden;
white-space: nowrap; white-space: nowrap;
@ -298,7 +303,7 @@ body > footer nav {
max-width: 80%; max-width: 80%;
} }
.footnote-ref { .footnote-back {
margin: 0 0 0 0.1em; margin: 0 0 0 0.1em;
} }
@ -306,6 +311,7 @@ body > footer nav {
.follow { .follow {
position: fixed; position: fixed;
right: 1em; right: 1em;
z-index: 100;
} }
.contrast { .contrast {
@ -334,4 +340,9 @@ body > footer nav {
body > header form { body > header form {
margin: 0; margin: 0;
} }
body > footer nav {
display: flex;
justify-content: space-between;
}
} }