Back To Pandoc

So, Python Markdown is a bottomless pit of horrors, including crippling parsing bugs,
random failures out of nowhere, and a lack of features. It's definitely much faster than
Pandoc, but Pandoc doesn't fall apart when there's a regex in a fenced code block
that happens to be a regex for markdown elements.

Also added some ugly post-processing string replacements to make Pandoc's fenced code
output work with Prism:
instead of Pandoc's <pre class="codelang"><code>, Prism wants
<pre><code class="language-codelang">, so I added a regex substitution, because it's 00:32.
This commit is contained in:
Peter Molnar 2018-08-04 00:28:55 +01:00
parent 96d0c238d6
commit d3fbf2e51f
8 changed files with 181 additions and 156 deletions

View file

@ -1,57 +0,0 @@
"""
This is a simplified FencedBlockPreprocessor which outputs "proper" <code>
naming, eg. language-python, instead of just python, so prism.js understands
it.
It doesn't deal with CodeHilite.
"""
from markdown.preprocessors import Preprocessor
from markdown.extensions import Extension
from markdown.extensions.fenced_code import FencedBlockPreprocessor
class HTML5FencedBlockPreprocessor(Preprocessor):
    """Turn fenced code blocks into stashed ``<pre><code>`` HTML.

    Every match of ``FENCED_BLOCK_RE`` is rendered with ``CODE_WRAP``; when
    the fence names a language, the ``<code>`` element gets a
    ``class="language-<lang>"`` attribute (the naming prism.js looks for).
    The rendered HTML is put into the Markdown instance's ``htmlStash`` and
    the block is replaced in the text by whatever ``store()`` returns.
    """

    FENCED_BLOCK_RE = FencedBlockPreprocessor.FENCED_BLOCK_RE
    CODE_WRAP = '<pre><code%s>%s</code></pre>'
    LANG_TAG = ' class="language-%s"'

    def __init__(self, md):
        super(HTML5FencedBlockPreprocessor, self).__init__(md)

    @staticmethod
    def _escape(txt):
        """Return ``txt`` with the HTML-special characters escaped."""
        # '&' has to be replaced first, otherwise the entities produced by
        # the following replacements would be escaped a second time.
        txt = txt.replace('&', '&amp;')
        txt = txt.replace('<', '&lt;')
        txt = txt.replace('>', '&gt;')
        txt = txt.replace('"', '&quot;')
        return txt

    def run(self, lines):
        """Replace all fenced code blocks in ``lines``.

        Args:
            lines: the document as a list of lines.

        Returns:
            The document as a list of lines, with each fenced block swapped
            for the value returned by ``htmlStash.store()``.
        """
        text = "\n".join(lines)
        while True:
            m = self.FENCED_BLOCK_RE.search(text)
            if not m:
                break
            lang = ''
            if m.group('lang'):
                lang = self.LANG_TAG % (m.group('lang'))
            # The code goes into the stash as raw HTML, so it must be
            # escaped here; otherwise '<', '>' and '&' inside a code block
            # end up as live markup in the output.
            code = self.CODE_WRAP % (
                lang,
                self._escape(m.group('code'))
            )
            placeholder = self.markdown.htmlStash.store(code)
            text = '%s\n%s\n%s' % (
                text[:m.start()],
                placeholder,
                text[m.end():]
            )
        return text.split("\n")
class HTML5FencedCodeExtension(Extension):
    """Markdown extension that installs HTML5FencedBlockPreprocessor."""

    def extendMarkdown(self, md, md_globals):
        """Register this extension and add its preprocessor to ``md``.

        The preprocessor is added under the key ``html5_fenced_code`` with
        the position spec ``>normalize_whitespace``.
        """
        md.registerExtension(self)
        registry = md.preprocessors
        fenced = HTML5FencedBlockPreprocessor(md)
        registry.add('html5_fenced_code', fenced, ">normalize_whitespace")
def makeExtension(*args, **kwargs):
    """Build an HTML5FencedCodeExtension, forwarding all arguments to it."""
    extension = HTML5FencedCodeExtension(*args, **kwargs)
    return extension

122
nasg.py
View file

@ -10,7 +10,7 @@ __email__ = "mail@petermolnar.net"
import glob
import os
import time
from functools import lru_cache as cached
from functools import partial
import re
import imghdr
import logging
@ -26,18 +26,15 @@ import langdetect
import wand.image
import jinja2
import frontmatter
import markdown
from feedgen.feed import FeedGenerator
from bleach import clean
from emoji import UNICODE_EMOJI
from slugify import slugify
import requests
from pandoc import pandoc
import exiftool
import settings
import keys
import html5_fenced_code
from pprint import pprint
MarkdownImage = namedtuple(
'MarkdownImage',
@ -56,33 +53,37 @@ RE_MDIMG = re.compile(
re.IGNORECASE
)
RE_HTTP = re.compile(
r'^https?://',
re.IGNORECASE
)
MD = markdown.Markdown(
output_format='xhtml5',
extensions=[
'html5_fenced_code',
'abbr',
'attr_list',
'def_list',
'footnotes',
'tables',
'smart_strong',
'headerid',
'urlize',
]
)
RE_CODE = re.compile(
r'(?:[~`]{3})(?:[^`]+)?'
)
RE_PRECODE = re.compile(
r'<pre class="([^"]+)"><code>'
)
class cached_property(object):
    """Descriptor that computes a value once per instance and stores it.

    On first access through an instance the wrapped method is called and
    its result is written onto the instance under ``self.name``. Because
    this class defines no ``__set__``, that instance attribute is found
    first on later lookups and the method is not called again.
    """

    def __init__(self, method, name=None):
        # keep the undecorated function and the attribute name to cache under
        self.method = method
        self.name = method.__name__ if not name else name
        self.__doc__ = method.__doc__

    def __get__(self, inst, cls):
        if inst is not None:
            # first access on this instance: compute, store on the instance,
            # and hand the value back
            value = self.method(inst)
            setattr(inst, self.name, value)
            return value
        # accessed on the class itself (e.g. `Foo.bar`): expose the descriptor
        return self
class MarkdownDoc(object):
@property
@cached()
@cached_property
def _parsed(self):
with open(self.fpath, mode='rt') as f:
logging.debug('parsing YAML+MD file %s', self.fpath)
@ -97,14 +98,16 @@ class MarkdownDoc(object):
def content(self):
return self._parsed[1]
@cached_property
def html_content(self):
    """Return the document body rendered to HTML.

    Each key of ``self.images`` (when that attribute exists and is
    non-empty) is replaced in the content by ``str()`` of its value, the
    result is passed to ``pandoc()``, and ``RE_PRECODE`` matches are then
    rewritten to ``<pre><code lang="X" class="language-X">``, where X is
    the pattern's first capture group.
    """
    c = "%s" % (self.content)
    if hasattr(self, 'images') and len(self.images):
        for match, img in self.images.items():
            c = c.replace(match, str(img))
    c = pandoc(c)
    # Raw string: "\g" is not a valid escape sequence in a normal string
    # literal, and the backslash has to reach re.sub() untouched to be
    # read as a group reference.
    c = RE_PRECODE.sub(r'<pre><code lang="\g<1>" class="language-\g<1>">', c)
    return c
class Comment(MarkdownDoc):
@ -188,8 +191,7 @@ class Redirect(Gone):
Redirect object for entries that moved
"""
@property
@cached()
@cached_property
def target(self):
target = ''
with open(self.fpath, 'rt') as f:
@ -219,8 +221,7 @@ class Singular(MarkdownDoc):
ret = ctime
return ret
@property
@cached()
@cached_property
def files(self):
"""
An array of files present at the same directory level as
@ -233,8 +234,7 @@ class Singular(MarkdownDoc):
if not k.endswith('.md') and not k.startswith('.')
]
@property
@cached()
@cached_property
def comments(self):
"""
An dict of Comment objects keyed with their path, populated from the
@ -251,8 +251,7 @@ class Singular(MarkdownDoc):
comments[c.dt.timestamp] = c
return comments
@property
@cached()
@cached_property
def images(self):
"""
A dict of WebImage objects, populated by:
@ -317,10 +316,10 @@ class Singular(MarkdownDoc):
def summary(self):
return self.meta.get('summary', '')
@cached_property
def html_summary(self):
    """Return ``self.summary`` converted by ``pandoc()``."""
    return pandoc(self.summary)
@property
def title(self):
@ -428,8 +427,7 @@ class Singular(MarkdownDoc):
else:
return False
@property
@cached()
@cached_property
def tmplvars(self):
v = {
'title': self.title,
@ -548,8 +546,7 @@ class WebImage(object):
'is_photo': self.is_photo,
})
@property
@cached()
@cached_property
def meta(self):
return exiftool.Exif(self.fpath)
@ -844,7 +841,7 @@ class AsyncWorker(object):
self._tasks = []
self._loop = asyncio.get_event_loop()
def append(self, job):
def add(self, job):
task = self._loop.create_task(job)
self._tasks.append(task)
@ -872,7 +869,7 @@ class IndexPHP(object):
if target in self.gone:
self.add_gone(source)
else:
if not RE_HTTP.match(target):
if '://' not in target:
target = "%s/%s" % (settings.site.get('url'), target)
self.redirect[source] = target
@ -1003,6 +1000,7 @@ class Category(dict):
fg = FeedGenerator()
fg.id(self.feed)
fg.link(href=self.feed, rel='self')
fg.link(href=settings.meta.get('hub'), rel='hub')
fg.title(self.title)
fg.author({
'name': settings.author.get('name'),
@ -1014,6 +1012,10 @@ class Category(dict):
for post in self.get_posts(start, end):
dt = arrow.get(post.get('pubtime'))
fe = fg.add_entry()
fe.author({
'name': settings.author.get('name'),
'email':settings.author.get('email')
})
fe.id(post.get('url'))
fe.link(href=post.get('url'))
fe.title(post.get('title'))
@ -1021,7 +1023,7 @@ class Category(dict):
fe.updated(dt.datetime)
fe.content(
post.get('html_content'),
type='CDATA'
#src=post.get('url')
)
fe.rights('%s %s %s' % (
post.get('licence').upper(),
@ -1035,15 +1037,15 @@ class Category(dict):
"%d" % enc.get('size'),
enc.get('mime')
)
atom = os.path.join(dirname, 'index.xml')
with open(atom, 'wb') as f:
logging.info('writing file: %s', atom)
f.write(fg.atom_str(pretty=True))
jsfile = os.path.join(dirname, 'index.json')
def render_page(self, pagenum=1, pages=1):
if self.display == 'flat':
start = 1
start = 0
end = -1
else:
pagination = int(settings.site.get('pagination'))
@ -1201,7 +1203,7 @@ class Sitemap(dict):
def renderfile(self):
return os.path.join(settings.paths.get('build'), 'sitemap.txt')
async def save(self):
async def render(self):
if self.mtime >= sorted(self.values())[-1]:
return
with open(self.renderfile, 'wt') as f:
@ -1274,8 +1276,8 @@ def make():
content = settings.paths.get('content')
worker = AsyncWorker()
rules = IndexPHP()
for e in glob.glob(os.path.join(content, '*', '*.ptr')):
post = Gone(e)
if post.mtime > last:
@ -1287,8 +1289,8 @@ def make():
last = post.mtime
rules.add_redirect(post.source, post.target)
if rules.mtime < last:
worker.append(rules.render())
if rules.mtime < last or settings.args.get('force'):
worker.add(rules.render())
sitemap = Sitemap()
search = Search()
@ -1297,10 +1299,10 @@ def make():
for e in sorted(glob.glob(os.path.join(content, '*', '*', 'index.md'))):
post = Singular(e)
worker.append(post.render())
worker.append(post.copyfiles())
worker.add(post.copyfiles())
for i in post.images.values():
worker.append(i.downsize())
worker.add(i.downsize())
worker.add(post.render())
if post.is_future:
continue
else:
@ -1322,11 +1324,11 @@ def make():
)
search.__exit__()
worker.append(search.render())
search.render()
for category in categories.values():
worker.append(category.render())
worker.add(category.render())
worker.append(sitemap.save())
worker.add(sitemap.render())
worker.run()
logging.info('worker finished')

44
pandoc.py Normal file
View file

@ -0,0 +1,44 @@
import subprocess
import logging
def pandoc(text):
    """Convert Markdown to HTML5 by piping it through the pandoc CLI.

    Args:
        text: Markdown source as a ``str``; it is sent to pandoc's stdin
            encoded as UTF-8.

    Returns:
        pandoc's stdout decoded as UTF-8, with surrounding whitespace
        stripped.

    A non-zero exit status or any stderr output is logged as a warning;
    whatever pandoc wrote to stdout is still returned.
    """
    # TODO: cache the result, e.g. keyed on a hash of `text`
    extensions = (
        'footnotes',
        'pipe_tables',
        'raw_html',
        'definition_lists',
        'backtick_code_blocks',
        'fenced_code_attributes',
        'shortcut_reference_links',
        'lists_without_preceding_blankline',
        'autolink_bare_uris',
    )
    cmd = (
        'pandoc',
        '-o-',
        '--from=markdown+%s' % '+'.join(extensions),
        '--to=html5',
        '--quiet',
        '--no-highlight',
    )
    # The command is an argument tuple (no shell), so `text` only ever
    # travels over stdin.
    proc = subprocess.run(
        cmd,
        input=text.encode('utf-8'),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if proc.returncode != 0 or proc.stderr:
        logging.warning(
            "Error during pandoc convert (exit status %s):\n\t%s\n\t%s",
            proc.returncode,
            cmd,
            # decode so the log shows text instead of a bytes repr
            proc.stderr.decode('utf-8', errors='replace').strip()
        )
    return proc.stdout.decode('utf-8').strip()

View file

@ -1,26 +1,10 @@
arrow==0.12.1
bleach==2.1.3
certifi==2018.4.16
chardet==3.0.4
decorator==4.3.0
emoji==0.5.0
feedgen==0.7.0
html5lib==1.0.1
idna==2.7
Jinja2==2.10
langdetect==1.0.7
lxml==4.2.3
Markdown==2.6.11
markdown-urlize==0.2.0
MarkupSafe==1.0
Pygments==2.2.0
python-dateutil==2.7.3
python-frontmatter==0.4.2
PyYAML==3.13
requests==2.19.1
six==1.11.0
unicode-slugify==0.1.3
Unidecode==1.0.22
urllib3==1.23
Wand==0.4.4
webencodings==0.5.1

View file

@ -6,12 +6,28 @@ $redirects = array(
{% endfor %}
);
$redirects_re = array(
'^(?:sysadmin|it|linux-tech-coding|sysadmin-blog)\/?(page.*)?$' => 'category/article/',
'^(?:fotography|photoblog)\/?(page.*)?$' => '/category/photo/$1',
'^blog\/?(page.*)?$' => '/category/journal/',
'^blips\/?(page.*)?$' => '/category/note/$1',
'^r\/?(page.*)?$' => '/category/note/$1',
'^(?:linux-tech-coding|it|sysadmin-blog|sysadmin|fotography|blips|blog|photoblog|article|journal|photo|note|r)\/((?!page).*)' => '/$1',
);
$gone = array(
{% for gone in gones %}
"{{ gone }}" => true,
{% endfor %}
);
$gone_re = array(
'^cache/.*$',
'^files/.*$',
'^wp-content/.*$',
'^broadcast\/wp-ffpc.message$',
);
function redirect_to($uri) {
header('HTTP/1.1 301 Moved Permanently');
@ -78,11 +94,33 @@ $uri = str_replace('/feed/', '', $uri);
$uri = str_replace('/atom/', '', $uri);
$uri = trim($uri, '/');
if (isset($gone[$uri]))
foreach ($gone_re as $pattern) {
if (preg_match(sprintf('/%s/', $pattern), $uri)) {
gone($uri);
}
}
foreach ($redirects_re as $pattern => $target) {
$maybe = preg_match(sprintf('/%s/i', $pattern), $uri, $matches);
if ($maybe) {
$target = str_replace('$1', $matches[1], $target);
redirect_to($target);
}
}
/* "logic" */
if (isset($gone[$uri])) {
gone($uri);
elseif (isset($redirects[$uri]))
}
elseif (isset($redirects[$uri])) {
redirect_to($redirects[$uri]);
elseif (strstr($uri, '_'))
}
elseif (preg_match('/^\.well-known\/(host-meta|webfinger).*$/', $uri)) {
redirect_to("https://fed.brid.gy/{$uri}");
}
elseif (strstr($uri, '_')) {
maybe_redirect(str_replace('_', '-', $uri));
else
}
else {
notfound();
}

View file

@ -5,3 +5,11 @@
<link rel="canonical" href="{{ post.url }}" />
<link rel="license" href="https://creativecommons.org/licenses/4.0/{{ post.licence }}" />
{% endblock %}
{% block prism %}
{% if post.has_code %}
<style media="all">
{% include 'prism.css' %}
</style>
<script src="{{ site.url }}/prism.js"></script>
{% endif %}
{% endblock %}

View file

@ -33,14 +33,11 @@
}
localStorage.setItem("stylesheet", setto);
e.setAttribute("media", setto);
return false;
}
</script>
{% if post.has_code %}
<style media="all">
{% include 'prism.css' %}
</style>
<script src="{{ site.url }}/prism.js"></script>
{% endif %}
{% block prism %}
{% endblock %}
</head>
<body>
@ -90,7 +87,7 @@
<p class="contrast">
<a title="toggle site colour scheme" href="#"
onclick="toggleStylesheet(this)">
onclick="return toggleStylesheet(this)">
<svg class="icon" width="16" height="16">
<use xlink:href="#icon-contrast" />
</svg>
@ -169,9 +166,7 @@
<a class="fn p-name url u-url u-uid" href="{{ author.url }}">
{{ author.name }}
</a>
&lt;<a rel="me" class="u-email email" href="mailto:{{ author.email }}">
{{ author.email }}
</a>&gt;
&lt;<a rel="me" class="u-email email" href="mailto:{{ author.email }}">{{ author.email }}</a>&gt;
</p>
</dd>
@ -422,7 +417,7 @@
</nav>
<div class="webring">
<a href="https://xn--sr8hvo.ws/🇻🇮📢/previous"></a>
Member of <a href="https://xn--sr8hvo.ws">IndieWeb Webring</a> 🕸💍
Member of <a href="https://xn--sr8hvo.ws">IndieWeb Webring</a>
<a href="https://xn--sr8hvo.ws/🇻🇮📢/next"></a>
</div>
</div>

View file

@ -25,7 +25,7 @@ svg {
width: 16px;
height: 16px;
fill: currentColor;
vertical-align: text-top;
vertical-align: middle;
}
a {
@ -41,6 +41,10 @@ h1 {
font-size: 1.6em;
}
h1, h2 {
line-height: 1.2em;
}
h2, h3 {
margin-top: 2em;
}
@ -153,6 +157,7 @@ code, pre {
pre {
padding: 0.6em;
position: relative;
}
code {
@ -163,6 +168,11 @@ pre > code {
border: none;
}
pre> code::before {
content: attr(lang);
float: right;
}
table {
border-collapse: collapse;
border-spacing: 0;
@ -280,16 +290,11 @@ body > footer dd {
margin: -1.3em 0 0 4em;
}
body > footer nav {
display: flex;
justify-content: space-between;
}
.webring {
text-align: center;
}
.footnote a {
.footnotes a {
display: inline-block;
overflow: hidden;
white-space: nowrap;
@ -298,7 +303,7 @@ body > footer nav {
max-width: 80%;
}
.footnote-ref {
.footnote-back {
margin: 0 0 0 0.1em;
}
@ -306,6 +311,7 @@ body > footer nav {
.follow {
position: fixed;
right: 1em;
z-index: 100;
}
.contrast {
@ -334,4 +340,9 @@ body > footer nav {
body > header form {
margin: 0;
}
body > footer nav {
display: flex;
justify-content: space-between;
}
}