all repos — nasg @ 864bb544967142fefe6f2abcc66d21464d0a567b

re-adding index.php magic instead of nginx rules; given that I need to use PHP
for the search functionality, it's better to keep this contained instead of
adding one more piece of black magic (e.g. nginx rules)

Note: I've sort of exhausted client-side JS search options; the smallest index
I was able to build was still a few megabytes, way too large for my taste, hence
the SQLite FTS4 Python + PHP solution. Yes, it's ugly, but it works, and PHP
is available nearly everywhere. Nearly.

Without something that can set headers properly, it's impossible to do a real
HTTP 410 Gone, or even an HTTP 301 or 302, so it's very likely I'm stuck with the
minimal PHP solution.
Peter Molnar hello@petermolnar.eu
Sun, 22 Jul 2018 14:52:32 +0100
commit

864bb544967142fefe6f2abcc66d21464d0a567b

parent

652d062d3179fbd46d20cc1b55b0e8e1b054b95f

3 files changed, 192 insertions(+), 67 deletions(-)

jump to
M .gitignore.gitignore

@@ -4,3 +4,4 @@ _scratch

.env keys.py +nasg.proj
M nasg.pynasg.py

@@ -65,6 +65,7 @@ 'urlize'

], ) + class MarkdownDoc(object): @property @cached()

@@ -160,13 +161,11 @@ """

def __init__(self, fpath): self.fpath = fpath - self.address, self.fext = os.path.splitext( - os.path.basename(self.fpath) - ) @property - def nginx(self): - return (self.address, 'return 410') + def source(self): + source, fext = os.path.splitext(os.path.basename(self.fpath)) + return source class Redirect(object):

@@ -176,7 +175,11 @@ """

def __init__(self, fpath): self.fpath = fpath - self.source, self.fext = os.path.splitext(os.path.basename(self.fpath)) + + @property + def source(self): + source, fext = os.path.splitext(os.path.basename(self.fpath)) + return source @property @cached()

@@ -184,20 +187,15 @@ def target(self):

target = '' with open(self.fpath, 'rt') as f: target = f.read().strip() - if not RE_HTTP.match(target): - target = "%s/%s" % (settings.site.get('url'), target) return target - @property - def nginx(self): - return (self.source, 'return 301 %s' % (self.target)) - class Singular(MarkdownDoc): """ A Singular object: a complete representation of a post, including all it's comments, files, images, etc """ + def __init__(self, fpath): self.fpath = fpath n = os.path.dirname(fpath)

@@ -445,11 +443,11 @@

@property def corpus(self): return "\n".join([ - self.title, - self.name, - self.summary, - self.content, - ]) + self.title, + self.name, + self.summary, + self.content, + ]) async def render(self): if self.exists:

@@ -810,20 +808,36 @@ w = asyncio.wait(self._tasks, return_when=asyncio.FIRST_EXCEPTION)

self._loop.run_until_complete(w) -class NginxConf(dict): - def __str__(self): - r = '' - for key in self: - r = "%slocation /%s { %s; }\n" % (r, key, self[key]) - return r +class IndexPHP(object): + def __init__(self): + self.gone = {} + self.redirect = {} + + def add_gone(self, uri): + self.gone[uri] = True + + def add_redirect(self, source, target): + if target in self.gone: + self.add_gone(source) + else: + if not RE_HTTP.match(target): + target = "%s/%s" % (settings.site.get('url'), target) + self.redirect[source] = target - def save(self): - fpath = os.path.join( + async def render(self): + target = os.path.join( settings.paths.get('build'), - '.nginx.conf' + 'index.php' ) - with open(fpath, 'wt') as f: - f.write(str(self)) + r = J2.get_template('Index.j2.php').render({ + 'post': {}, + 'site': settings.site, + 'gones': self.gone, + 'redirects': self.redirect + }) + with open(target, 'wt') as f: + logging.info("rendering to %s", target) + f.write(r) class Category(dict):

@@ -915,22 +929,22 @@

def ping_websub(self): return # TODO aiohttp? - ## ping pubsub - #r = requests.post( - #shared.site.get('websub').get('hub'), - #data={ - #'hub.mode': 'publish', - #'hub.url': flink - #} - #) - #logging.info(r.text) + # ping pubsub + # r = requests.post( + # shared.site.get('websub').get('hub'), + # data={ + # 'hub.mode': 'publish', + # 'hub.url': flink + # } + # ) + # logging.info(r.text) def render_feed(self): logging.info('rendering category "%s" ATOM feed', self.name) start = 0 end = int(settings.site.get('pagination')) - dirname = os.path.join(self.renderdir,'feed') + dirname = os.path.join(self.renderdir, 'feed') if not os.path.isdir(dirname): os.makedirs(dirname)

@@ -951,7 +965,7 @@ fg.logo('%s/favicon.png' % settings.site.get('url'))

fg.updated(arrow.get(self.mtime).to('utc').datetime) - for post in self.get_posts(start,end): + for post in self.get_posts(start, end): dt = arrow.get(post.get('pubtime')) fe = fg.add_entry() fe.id(post.get('url'))

@@ -968,18 +982,17 @@ post.get('licence').upper(),

settings.author.get('name'), dt.format('YYYY') )) - #if p.get('enclosure'): - #enclosure = p.get('enclosure') - #fe.enclosure( - #enclosure.get('url'), - #"%d" % enclosure.get('size'), - #enclosure.get('mime') - #) + # if p.get('enclosure'): + #enclosure = p.get('enclosure') + # fe.enclosure( + # enclosure.get('url'), + #"%d" % enclosure.get('size'), + # enclosure.get('mime') + # ) atom = os.path.join(dirname, 'index.xml') with open(atom, 'wb') as f: logging.info('writing file: %s', atom) f.write(fg.atom_str(pretty=True)) - def render_page(self, pagenum=1, pages=1): if self.display == 'flat':

@@ -1032,6 +1045,7 @@ page = page + 1

self.render_feed() self.ping_websub() + class Search(object): def __init__(self): self.fpath = os.path.join(

@@ -1058,20 +1072,43 @@ notindexed=url,

notindexed=mtime, tokenize=porter )''' - ) + ) def __exit__(self): self.db.commit() self.db.execute('PRAGMA auto_vacuum;') self.db.close() + def exists(self, name, mtime=0): + ret = False + maybe = self.db.execute(''' + SELECT + mtime + FROM + data + WHERE + name = ? + ''', (name,)).fetchone() + if maybe: + ret = int(maybe[0]) + return ret + def append(self, url, mtime, name, title, category, content): - # TODO: delete if mtime differs mtime = int(mtime) + exists = self.exists(name, mtime) + if (exists and exists < mtime): + self.db.execute(''' + DELETE + FROM + data + WHERE + name=?''', (name,)) self.db.execute(''' - INSERT OR IGNORE INTO data - (url, mtime, name, title, category, content) - VALUES (?,?,?,?,?,?); + INSERT OR IGNORE INTO + data + (url, mtime, name, title, category, content) + VALUES + (?,?,?,?,?,?); ''', ( url, mtime,

@@ -1082,6 +1119,12 @@ content

)) async def render(self): + target = os.path.join( + settings.paths.get('build'), + 'search.php' + ) + if os.path.exists(target): + return r = J2.get_template('Search.j2.php').render({ 'post': {}, 'site': settings.site,

@@ -1091,10 +1134,6 @@ 'licence': settings.licence,

'tips': settings.tips, 'labels': settings.labels }) - target = os.path.join( - settings.paths.get('build'), - 'search.php' - ) with open(target, 'wt') as f: logging.info("rendering to %s", target) f.write(r)

@@ -1103,19 +1142,17 @@

def make(): start = int(round(time.time() * 1000)) content = settings.paths.get('content') + worker = AsyncWorker() - nginxrules = NginxConf() - for e in glob.glob(os.path.join(content, '*', '*.lnk')): - post = Redirect(e) - location, rule = post.nginx - nginxrules[location] = rule + rules = IndexPHP() for e in glob.glob(os.path.join(content, '*', '*.ptr')): post = Gone(e) - location, rule = post.nginx - nginxrules[location] = rule - nginxrules.save() + rules.add_gone(post.source) + for e in glob.glob(os.path.join(content, '*', '*.lnk')): + post = Redirect(e) + rules.add_redirect(post.source, post.target) + worker.append(rules.render()) - worker = AsyncWorker() categories = {} categories['/'] = Category() sitemap = OrderedDict()

@@ -1166,10 +1203,9 @@ t = os.path.join(settings.paths.get('build'), 'sitemap.txt')

with open(t, 'wt') as f: f.write("\n".join(sorted(sitemap.keys()))) - - end = int(round(time.time() * 1000)) logging.info('process took %d ms' % (end - start)) + if __name__ == '__main__': make()
A templates/Index.j2.php

@@ -0,0 +1,88 @@

+<?php + +$redirects = array( +{% for from, to in redirects.items() %} + "{{ from }}" => "{{ to }}", +{% endfor %} +); + +$gone = array( +{% for gone in gones %} + "{{ gone }}" => true, +{% endfor %} +); + + +function redirect_to($uri) { + header('HTTP/1.1 301 Moved Permanently'); + if (preg_match("/^https?/", $uri)) + $target = $uri; + else + $target = '{{ site.url }}/'. trim($uri, '/') . '/'; + header("Location: ". $target); + exit; +} + +function gone($uri) { + header('HTTP/1.1 410 Gone'); + die('<!DOCTYPE html> +<html lang="en"> + <head> + <meta charset="utf-8"/> + <meta content="width=device-width,initial-scale=1,minimum-scale=1" name="viewport"/> + <title>Gone</title> + </head> + <body> +<h1><center>This content was deleted.</center></h1> +<hr> +<p><center>{{ site.domain }}</center></p> + </body> +</html>'); +} + +function notfound() { + header('HTTP/1.0 404 Not Found'); + die('<!DOCTYPE html> +<html lang="en"> + <head> + <meta charset="utf-8"/> + <meta content="width=device-width,initial-scale=1,minimum-scale=1" name="viewport"/> + <title>Not found</title> + </head> + <body> + +<h1><center>This was not found.</center></h1> +<h2><center>Please search for it instead.</center></h2> +<p> + <center> +<form action="/search.php" class="search-form" method="get" role="search"> + <label for="search">Search</label> + <input id="q" name="q" placeholder="search..." 
title="Search for:" type="search" value=""/> + <input type="submit" value="OK"/> +</form> + </center> +</p> + </body> +</html>'); +} + +function maybe_redirect($uri) { + if (file_exists("./$uri/index.html")) { + redirect_to($uri); + } +} + +$uri = filter_var($_SERVER['REQUEST_URI'], FILTER_SANITIZE_URL); +$uri = str_replace('../', '', $uri); +$uri = str_replace('/feed/', '', $uri); +$uri = str_replace('/atom/', '', $uri); +$uri = trim($uri, '/'); + +if (isset($gone[$uri])) + gone($uri); +elseif (isset($redirects[$uri])) + redirect_to($redirects[$uri]); +elseif (strstr($uri, '_')) + maybe_redirect(str_replace('_', '-', $uri)); +else + notfound();