2.0-alpha1: tags dropped, favs dropped, bookmarks dropped, reposts dropped, better async rendering; TODO comments, websub pings, webmentions
parent 112448cf92
commit 4a699ef9f5
7 changed files with 1495 additions and 2234 deletions
README.md (168 lines changed)

@@ -1,8 +1,166 @@
# NASG: Not Another Static Generator...
# NASG (Not Another Static Generator)

So I ended up writing my static generator and this is (most) of the code for it.
This is a tiny static site generator, written in Python, to scratch my own itches.
It is most probably not suitable for anyone else.

Don't expect anything fancy and please be aware that my Python Fu has much to learn.

## Why not [insert static generator here]?

I've written about the generic ideas and approaches in my [Going Static](https://petermolnar.net/going-static/) entry.

- DRY (Don't Repeat Yourself) is good, so instead of sidecar files for images I'm using XMP metadata, which most of the available generators don't handle well (see the sketch below);
- writing a proper plugin for existing generators - Pelican, Nikola, etc. - might have taken longer, and I wanted to extend my Python knowledge;
- I wanted to use the best available utilities for some tasks, like `Pandoc` and `exiftool`, instead of Python libraries trying to achieve the same;
- I needed to handle webmentions and comments.

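To make the metadata-instead-of-sidecar-files point concrete, here is a minimal sketch of the idea: shell out to `exiftool` and parse its JSON output, which is roughly what the `ExifTool` class added to `shared.py` in this commit does. `exiftool` needs to be installed and on PATH; the tag selection and file name below are only illustrative.

```
#!/usr/bin/env python3
"""Minimal sketch: read embedded XMP/EXIF metadata with exiftool
instead of keeping a sidecar file next to the image."""

import json
import subprocess


def read_meta(fpath):
    # -json makes exiftool print a JSON array with one object per file
    out = subprocess.run(
        ['exiftool', '-json', '-Artist', '-Copyright', '-Description',
         '-CreateDate', '-Headline', fpath],
        stdout=subprocess.PIPE,
        check=True
    )
    return json.loads(out.stdout.decode('utf-8'))[0]


if __name__ == '__main__':
    meta = read_meta('example.jpg')  # hypothetical file
    print(meta.get('Artist'), meta.get('Copyright'))
```
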
Don't expect anything fancy: my Python Fu has much to learn.

## How content is organized

The directory structure of the "source" is something like this:

```
├── content
│   ├── category1 (containing YAML + MD files)
│   ├── category2 (containing YAML + MD files)
│   ├── photo (containing jpg files)
│   ├── _category_excluded_from_listing_1 (containing YAML + MD files)
├── files
│   ├── image (my own pictures)
│   ├── photo -> ../content/photo
│   └── pic (random images)
├── nasg
│   ├── archive.py
│   ├── config.ini
│   ├── db.py
│   ├── LICENSE
│   ├── nasg.py
│   ├── README.md
│   ├── requirements.txt
│   ├── router.py
│   ├── shared.py
│   └── templates
├── static
│   ├── favicon.ico
│   ├── favicon.png
│   └── pgp.asc
└── var
    ├── gone.tsv
    ├── redirects.tsv
    ├── s.sqlite
    ├── tokens.json
    └── webmention.sqlite
```

Content files are either YAML + Markdown, with a `.md` extension, or JPG with embedded metadata, with a `.jpg` extension.
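
As a purely hypothetical illustration of a `.md` content file (the front matter field names are made up for the example, not a documented schema):

```
---
title: An example entry
published: 2017-05-26T10:00:00+00:00
summary: A one-line description of the entry.
---

The body is plain Markdown; inline images like
![a pretty photo](/files/image/example.jpg) are matched against the
subdirectories of `files`.
```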

Inline images in the content are checked against all subdirectories in `files`; their EXIF gets read and displayed as well if the Artist and/or Copyright EXIF fields match the regex set in the configuration.

`gone.tsv` is a simple list of URIs that should return a `410 Gone` response, while `redirects.tsv` is a tab-separated file of `from to` entries that should be `301` redirected; a sketch of both follows. These end up in a magic.php file, so if the host can execute PHP, it will take care of them.
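
A made-up sketch of the two files; `gone.tsv` holds one URI per line, `redirects.tsv` one tab-separated `from to` pair per line (the URIs are invented):

```
gone.tsv:
/old-category/deleted-entry/

redirects.tsv:
/old-entry/	https://domain.com/new-entry/
```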

## Output

`nasg.py` generates a `build` directory which has a directory per entry, each with an `index.html`, so URLs can be `https://domain.com/filename/`.

Categories are rendered into `category/category_name`, with pagination under `category/category_name/page/X`. They include a feed as well, at `category/category_name/feed`, in the form of an `index.atom` ATOM feed; the sketch below shows the resulting layout.
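
Based on the description above, the generated `build` directory could look something like this (entry and category names are made up):

```
build
├── some-entry
│   └── index.html
└── category
    └── category1
        ├── index.html
        ├── feed
        │   └── index.atom
        └── page
            └── 2
                └── index.html
```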

## Webserver configuration

A minimal nginx configuration for the virtualhost:

```
# --- Virtual Host ---
upstream {{ domain }} {
    server unix:/var/run/php/{{ domain }}.sock;
}

server {
    listen 80;
    server_name .{{ domain }};
    rewrite ^ https://$server_name$request_uri redirect;
    access_log /dev/null;
    error_log /dev/null;
}

server {
    listen 443 ssl http2;
    server_name .{{ domain }};
    ssl_certificate /etc/letsencrypt/live/{{ domain }}/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/{{ domain }}/privkey.pem;
    ssl_dhparam dh.pem;
    add_header X-Frame-Options "SAMEORIGIN";
    add_header X-Content-Type-Options "nosniff";
    add_header X-XSS-Protection "1; mode=block";
    add_header Strict-Transport-Security "max-age=31536000; includeSubdomains;";

    root /[path to root]/{{ domain }};

    location = /favicon.ico {
        log_not_found off;
        access_log off;
    }

    location = /robots.txt {
        log_not_found off;
        access_log off;
    }

    location ~ ^(?<script_name>.+?\.php)(?<path_info>.*)$ {
        try_files $uri $script_name =404;
        fastcgi_param SCRIPT_FILENAME $document_root$script_name;
        fastcgi_param SCRIPT_NAME $script_name;
        fastcgi_param PATH_INFO $path_info;
        fastcgi_param PATH_TRANSLATED $document_root$path_info;
        fastcgi_param QUERY_STRING $query_string;
        fastcgi_param REQUEST_METHOD $request_method;
        fastcgi_param CONTENT_TYPE $content_type;
        fastcgi_param CONTENT_LENGTH $content_length;
        fastcgi_param SCRIPT_NAME $script_name;
        fastcgi_param REQUEST_URI $request_uri;
        fastcgi_param DOCUMENT_URI $document_uri;
        fastcgi_param DOCUMENT_ROOT $document_root;
        fastcgi_param SERVER_PROTOCOL $server_protocol;
        fastcgi_param GATEWAY_INTERFACE CGI/1.1;
        fastcgi_param SERVER_SOFTWARE nginx;
        fastcgi_param REMOTE_ADDR $remote_addr;
        fastcgi_param REMOTE_PORT $remote_port;
        fastcgi_param SERVER_ADDR $server_addr;
        fastcgi_param SERVER_PORT $server_port;
        fastcgi_param SERVER_NAME $server_name;
        fastcgi_param HTTP_PROXY "";
        fastcgi_param HTTPS $https if_not_empty;
        fastcgi_param SSL_PROTOCOL $ssl_protocol if_not_empty;
        fastcgi_param SSL_CIPHER $ssl_cipher if_not_empty;
        fastcgi_param SSL_SESSION_ID $ssl_session_id if_not_empty;
        fastcgi_param SSL_CLIENT_VERIFY $ssl_client_verify if_not_empty;
        fastcgi_param REDIRECT_STATUS 200;
        fastcgi_index index.php;
        fastcgi_connect_timeout 10;
        fastcgi_send_timeout 360;
        fastcgi_read_timeout 3600;
        fastcgi_buffer_size 512k;
        fastcgi_buffers 512 512k;
        fastcgi_keep_conn on;
        fastcgi_intercept_errors on;
        fastcgi_split_path_info ^(?<script_name>.+?\.php)(?<path_info>.*)$;
        fastcgi_pass {{ domain }};
    }

    location / {
        try_files $uri $uri/ $uri.html $uri/index.html $uri/index.xml $uri/index.atom index.php @rewrites;
    }

    location @rewrites {
        rewrite ^ /magic.php?$args last;
    }

    location ~* \.(css|js|eot|woff|ttf|woff2)$ {
        expires 1d;
        add_header Cache-Control "public, must-revalidate, proxy-revalidate";
        add_header "Vary" "Accept-Encoding";
    }

    location ~* \.(png|ico|gif|svg|jpg|jpeg|webp|avi|mpg|mpeg|mp4|mp3)$ {
        expires 7d;
        add_header Cache-Control "public, must-revalidate, proxy-revalidate";
        add_header "Vary" "Accept-Encoding";
    }
}
```
archive.py (71 lines changed)

@@ -5,14 +5,16 @@ import glob
import logging
import shutil
import subprocess
import imghdr
import arrow

from pprint import pprint

from requests_oauthlib import OAuth1Session, oauth1_session, OAuth2Session, oauth2_session
from oauthlib.oauth2 import BackendApplicationClient

import db
import shared


class Favs(object):
    def __init__(self, confgroup):
        self.confgroup = confgroup

@@ -101,6 +103,7 @@ class FlickrFavs(Favs):

            fav = FlickrFav(photo)
            if not fav.exists:
                fav.run()
                #fav.fix_extension()

class FivehpxFavs(Favs):
    def __init__(self):

@@ -179,6 +182,7 @@ class FivehpxFavs(Favs):

            fav = FivehpxFav(photo)
            if not fav.exists:
                fav.run()
                #fav.fix_extension()


class TumblrFavs(Favs):

@@ -242,7 +246,7 @@ class DAFavs(Favs):

            'https://www.deviantart.com/api/v1/oauth2/collections/folders',
            params={
                'username': self.username,
                'calculate_size': 'false',
                'calculate_size': 'true',
                'ext_preload': 'false',
                'mature_content': 'true'
            }

@@ -304,29 +308,29 @@ class DAFavs(Favs):

        has_more = self.has_more(js.get('has_more'))
        offset = js.get('next_offset')
        while True == has_more:
            logging.info('iterating over DA results with offset %d', offset)
            #logging.info('iterating over DA results with offset %d', offset)
            paged = self.getpaged(offset)
            new = paged.get('results', [])
            if not len(new):
                #logging.error('empty results from deviantART, breaking loop')
                break
            favs = favs + new
            favs = [*favs, *new]
            has_more = self.has_more(paged.get('has_more'))
            if not has_more:
                break
            n = int(paged.get('next_offset'))
            if not n:
                break
            offset = offset + n
            offset = n

        self.favs = favs
        for fav in self.favs:
            f = DAFav(fav)
            if f.exists:
                continue
            if not f.exists:
                f.fav.update({'meta': self.getsinglemeta(fav.get('deviationid'))})
                f.run()
            #f.fix_extension()

            f.fav.update({'meta': self.getsinglemeta(fav.get('deviationid'))})
            f.run()

class ImgFav(object):
    def __init__(self):

@@ -349,7 +353,19 @@ class ImgFav(object):

    @property
    def exists(self):
        return os.path.exists(self.target)
        maybe = glob.glob(self.target.replace('.jpg', '.*'))
        if len(maybe):
            return True
        return False

    def fix_extension(self):
        # identify file format
        what = imghdr.what(self.target)
        # rename file
        new = self.target.replace('.jpg', '.%s' % what)
        if new != self.target:
            shutil.move(self.target, new)
            self.target = new

    def pull_image(self):
        logging.info("pulling image %s to %s", self.imgurl, self.target)

@@ -359,8 +375,11 @@ class ImgFav(object):

            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)


    def write_exif(self):
        what = imghdr.what(self.target)
        if 'jpg' != what or 'png' != what:
            return

        logging.info('populating EXIF data of %s' % self.target)
        tags = list(set(self.meta.get('tags',[])))
        dt = self.meta.get('dt').to('utc')

@@ -387,7 +406,7 @@ class ImgFav(object):

        params = [
            'exiftool',
            '-overwrite_original',
            '-EXIF:Artist=%s' % author_name[:64],
            #'-EXIF:Artist=%s' % author_name[:64],
            '-XMP:Copyright=Copyright %s %s (%s)' % (
                dt.format('YYYY'),
                author_name,

@@ -501,6 +520,7 @@ class FlickrFav(ImgFav):

            self.photo.get('description', {}).get('_content', '')
        )

        self.fix_extension()
        self.write_exif()

class FivehpxFav(ImgFav):

@@ -546,12 +566,14 @@ class FivehpxFav(ImgFav):

        }
        c = "%s" % self.photo.get('description', '')
        self.content = shared.Pandoc('plain').convert(c)
        self.fix_extension()
        self.write_exif()

class DAFav(ImgFav):
    def __init__(self, fav):
        self.fav = fav
        self.deviationid = fav.get('deviationid')
        #logging.info('working on %s', self.deviationid)
        self.url = fav.get('url')
        self.title = fav.get('title', False) or self.deviationid
        self.author = self.fav.get('author').get('username')

@@ -562,9 +584,21 @@ class DAFav(ImgFav):

            shared.slugfname(self.author)
            )
        )

        self.imgurl = None
        if 'content' in fav:
            if 'src' in fav['content']:
                self.imgurl = fav.get('content').get('src')
        elif 'preview' in fav:
            if 'src' in fav['preview']:
                self.imgurl = fav.get('preview').get('src')
        self.imgurl = fav.get('content', {}).get('src')

    def run(self):
        if not self.imgurl:
            logging.error('imgurl is empty for deviantart %s', self.deviationid)
            return

        self.pull_image()

        self.meta = {

@@ -583,6 +617,7 @@ class DAFav(ImgFav):

        }
        c = "%s" % self.fav.get('meta', {}).get('description', '')
        self.content = shared.Pandoc('plain').convert(c)
        self.fix_extension()
        self.write_exif()



@@ -600,7 +635,10 @@ class TumblrFav(object):

    @property
    def exists(self):
        return os.path.exists(self.target.replace('.jpg', '_0.jpg'))
        maybe = glob.glob(self.target.replace('.jpg', '_0.*'))
        if len(maybe):
            return True
        return False

    def run(self):
        content = "%s" % self.like.get('caption', '')

@@ -635,6 +673,7 @@ class TumblrFav(object):

            img.content = content
            img.meta = meta
            img.pull_image()
            img.fix_extension()
            img.write_exif()
            icntr = icntr + 1


@@ -681,7 +720,7 @@ class Oauth1Flow(object):

        self.service = service
        self.key = shared.config.get("api_%s" % service, 'api_key')
        self.secret = shared.config.get("api_%s" % service, 'api_secret')
        self.tokendb = shared.TokenDB()
        self.tokendb = db.TokenDB()
        self.t = self.tokendb.get_service(self.service)
        self.oauth_init()


@@ -796,7 +835,7 @@ class TumblrOauth(Oauth1Flow):


if __name__ == '__main__':
    logging.basicConfig(level=10)
    logging.basicConfig(level=20)

    flickr = FlickrFavs()
    flickr.run()

db.py (new file, 234 lines)

@@ -0,0 +1,234 @@
import os
import json
import sqlite3
import glob
import shared

# TODO sqlite3 cache instead of filesystem ?

class TokenDB(object):
    def __init__(self, uuid='tokens'):
        self.db = shared.config.get('var', 'tokendb')
        self.tokens = {}
        self.refresh()

    def refresh(self):
        self.tokens = {}
        if os.path.isfile(self.db):
            with open(self.db, 'rt') as f:
                self.tokens = json.loads(f.read())

    def save(self):
        with open(self.db, 'wt') as f:
            f.write(json.dumps(
                self.tokens, indent=4, sort_keys=True
            ))

    def get_token(self, token):
        return self.tokens.get(token, None)

    def get_service(self, service):
        token = self.tokens.get(service, None)
        return token

    def set_service(self, service, tokenid):
        self.tokens.update({
            service: tokenid
        })
        self.save()

    def update_token(self,
                     token,
                     oauth_token_secret=None,
                     access_token=None,
                     access_token_secret=None,
                     verifier=None):

        t = self.tokens.get(token, {})
        if oauth_token_secret:
            t.update({
                'oauth_token_secret': oauth_token_secret
            })
        if access_token:
            t.update({
                'access_token': access_token
            })
        if access_token_secret:
            t.update({
                'access_token_secret': access_token_secret
            })
        if verifier:
            t.update({
                'verifier': verifier
            })

        self.tokens.update({
            token: t
        })
        self.save()

    def clear(self):
        self.tokens = {}
        self.save()

    def clear_service(self, service):
        t = self.tokens.get(service)
        if t:
            del(self.tokens[t])
        del(self.tokens[service])
        self.save()

class SearchDB(object):
    tmplfile = 'Search.html'

    def __init__(self):
        self.db = sqlite3.connect(
            "%s" % shared.config.get('var', 'searchdb')
        )

        cursor = self.db.cursor()
        cursor.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS data USING FTS5(
            id,
            corpus,
            mtime,
            url,
            category,
            title
        )''')
        self.db.commit()

    def __exit__(self):
        self.finish()

    def finish(self):
        self.db.close()

    def append(self, id, corpus, mtime, url, category, title):
        mtime = int(mtime)
        cursor = self.db.cursor()
        cursor.execute('''UPDATE data SET corpus=?, mtime=?, url=?, category=?, title=? WHERE id=?;''', (
            corpus,
            mtime,
            url,
            category,
            title,
            id
        ))
        cursor.execute('''INSERT OR IGNORE INTO data (id, corpus, mtime, url, category, title) VALUES (?,?,?,?,?,?);''', (
            id,
            corpus,
            mtime,
            url,
            category,
            title
        ))
        self.db.commit()

    def is_uptodate(self, fname, mtime):
        ret = {}
        cursor = self.db.cursor()
        cursor.execute('''SELECT mtime
            FROM data
            WHERE id = ? AND mtime = ?''',
            (fname, mtime)
        )
        rows = cursor.fetchall()
        if len(rows):
            return True
        return False

    def search_by_query(self, query):
        ret = {}
        cursor = self.db.cursor()
        cursor.execute('''SELECT
            id, category, url, title, highlight(data, 0, '<strong>', '</strong>') corpus
            FROM data
            WHERE data MATCH ?
            ORDER BY category, rank;''', (query,))
        rows = cursor.fetchall()
        for r in rows:
            r = {
                'id': r[0],
                'category': r[1],
                'url': r[2],
                'title': r[3],
                'txt': r[4],
            }

            category = r.get('category')
            if category not in ret:
                ret.update({category: {}})

            maybe_fpath = os.path.join(
                shared.config.get('dirs', 'content'),
                category,
                "%s.*" % r.get('id')
            )
            #fpath = glob.glob(maybe_fpath).pop()
            ret.get(category).update({
                r.get('id'): {
                    #'fpath': fpath,
                    'url': r.get('url'),
                    'title': r.get('title'),
                    'txt': r.get('txt')
                }
            })
        return ret


    def cli(self, query):
        results = self.search_by_query(query)
        for c, items in sorted(results.items()):
            print("%s:" % c)
            for fname, data in sorted(items.items()):
                print(" %s" % data.get('fpath'))
                print(" %s" % data.get('url'))
                print("")

    def html(self, query):
        tmplvars = {
            'results': self.search_by_query(query),
            'term': query
        }
        return shared.j2.get_template(self.tmplfile).render(tmplvars)


class WebmentionQueue(object):
    def __init__(self):
        self.db = sqlite3.connect(
            "%s" % shared.config.get('var', 'webmentiondb')
        )

        cursor = self.db.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS `archive` (
            `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
            `received` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
            `processed` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
            `source` TEXT NOT NULL,
            `target` TEXT NOT NULL
        );''');

        cursor.execute('''CREATE TABLE IF NOT EXISTS `queue` (
            `id` INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
            `timestamp` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
            `source` TEXT NOT NULL,
            `target` TEXT NOT NULL
        );''');
        self.db.commit()

    def __exit__(self):
        self.finish()

    def finish(self):
        self.db.close()

    def queue(self, source, target):
        cursor = self.db.cursor()
        cursor.execute(
            '''INSERT INTO queue (source,target) VALUES (?,?);''', (
                source,
                target
            )
        )
        self.db.commit()
requirements.txt

@@ -1,30 +1,8 @@
aiofiles==0.3.1
appdirs==1.4.3
arrow==0.10.0
breadability==0.1.20
chardet==3.0.3
decorator==4.0.11
docopt==0.6.2
httptools==0.0.9
Jinja2==2.9.6
langdetect==1.0.7
lxml==3.7.3
MarkupSafe==1.0
packaging==16.8
pyparsing==2.2.0
python-dateutil==2.6.0
python-frontmatter==0.4.2
python-magic==0.4.13
PyYAML==3.12
requests==2.14.2
sanic==0.5.4
similar-text==0.2.0
six==1.10.0
ujson==1.35
requests==2.12.4
requests-oauthlib==0.8.0
sanic==0.6.0
unicode-slugify==0.1.3
Unidecode==0.4.20
uvloop==0.8.0
validators==0.11.3
Wand==0.4.4
websockets==3.3
Whoosh==2.7.4

router.py (new file, 86 lines)

@@ -0,0 +1,86 @@
#!/usr/bin/env python3

#import asyncio
#import uvloop
from sanic import Sanic
import sanic.response
import logging
import db
import shared
import validators
import urllib.parse

if __name__ == '__main__':
    logging_format = "[%(asctime)s] %(process)d-%(levelname)s "
    logging_format += "%(module)s::%(funcName)s():l%(lineno)d: "
    logging_format += "%(message)s"

    logging.basicConfig(
        format=logging_format,
        level=logging.DEBUG
    )
    log = logging.getLogger()

    # log_config=None prevents creation of access_log and error_log files
    # since I'm running this from systemctl it already goes into syslog
    app = Sanic('router', log_config=None)
    # this is ok to be read-only
    sdb = db.SearchDB()


    @app.route("/oauth1", methods=["GET"])
    async def oauth1(request):
        token = request.args.get('oauth_token')
        verifier = request.args.get('oauth_verifier')
        tokendb = shared.TokenDB()
        tokendb.update_token(
            token,
            verifier=verifier
        )
        return sanic.response.text("OK", status=200)


    @app.route("/search", methods=["GET"])
    async def search(request):
        query = request.args.get('s')
        r = sdb.html(query)
        response = sanic.response.html(r, status=200)
        return response


    @app.route("/micropub", methods=["POST","GET"])
    async def micropub(request):
        return sanic.response.text("Not Implemented", status=501)


    @app.route("/webmention", methods=["POST"])
    async def webmention(request):
        source = request.form.get('source')
        target = request.form.get('target')

        # validate urls
        if not validators.url(source):
            return sanic.response.text('Invalid source url', status=400)
        if not validators.url(target):
            return sanic.response.text('Invalid target url', status=400)

        # check if our site is actually the target for the webmention
        _target = urllib.parse.urlparse(target)
        if _target.hostname not in shared.config.get('site', 'domains'):
            return sanic.response.text('target domain is not me', status=400)

        # ignore selfpings
        _source = urllib.parse.urlparse(source)
        if _source.hostname in shared.config.get('site', 'domains'):
            return sanic.response.text('selfpings are not allowed', status=400)

        # it is unfortunate that I need to init this every time, but
        # otherwise it'll become read-only for reasons I'm yet to grasp
        # the actual parsing will be done at site generation time
        wdb = db.WebmentionQueue()
        wdb.queue(source, target)
        response = sanic.response.text("Accepted", status=202)
        return response


    app.run(host="127.0.0.1", port=8008, log_config=None)
shared.py (416 lines changed)

@@ -5,131 +5,10 @@ import glob
import logging
import subprocess
import json
import requests
from urllib.parse import urlparse, urlunparse
import sqlite3

from whoosh import fields
from whoosh import analysis
from slugify import slugify

LLEVEL = {
    'critical': 50,
    'error': 40,
    'warning': 30,
    'info': 20,
    'debug': 10
}


def __expandconfig(config):
    """ add the dirs to the config automatically """
    basepath = os.path.expanduser(config.get('common','base'))
    config.set('common', 'basedir', basepath)
    for section in ['source', 'target']:
        for option in config.options(section):
            opt = config.get(section, option)
            config.set(section, "%sdir" % option, os.path.join(basepath,opt))
    config.set('target', 'filesdir', os.path.join(
        config.get('target', 'builddir'),
        config.get('source', 'files'),
    ))
    config.set('target', 'commentsdir', os.path.join(
        config.get('target', 'builddir'),
        config.get('site', 'commentspath'),
    ))
    return config


def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
    """ Used to create short, lowercase slug for a number (an epoch) passed """
    num = int(num)
    return ((num == 0) and numerals[0]) or (
        baseN(
            num // b,
            b,
            numerals
        ).lstrip(numerals[0]) + numerals[num % b]
    )

def slugfname(url):
    return "%s" % slugify(
        re.sub(r"^https?://(?:www)?", "", url),
        only_ascii=True,
        lower=True
    )[:200]

ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
STRFISO = '%Y-%m-%dT%H:%M:%S%z'

URLREGEX = re.compile(
    r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
    r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
)

EXIFREXEG = re.compile(
    r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
    r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
)

MDIMGREGEX = re.compile(
    r'(!\[(.*)\]\((?:\/(?:files|cache)'
    r'(?:\/[0-9]{4}\/[0-9]{2})?\/(.*\.(?:jpe?g|png|gif)))'
    r'(?:\s+[\'\"]?(.*?)[\'\"]?)?\)(?:\{(.*?)\})?)'
    , re.IGNORECASE)

schema = fields.Schema(
    url=fields.ID(
        stored=True,
        unique=True
    ),
    category=fields.TEXT(
        stored=True,
    ),
    date=fields.DATETIME(
        stored=True,
        sortable=True
    ),
    title=fields.TEXT(
        stored=True,
        analyzer=analysis.FancyAnalyzer()
    ),
    weight=fields.NUMERIC(
        sortable=True
    ),
    img=fields.TEXT(
        stored=True
    ),
    content=fields.TEXT(
        stored=True,
        analyzer=analysis.FancyAnalyzer()
    ),
    fuzzy=fields.NGRAMWORDS(
        tokenizer=analysis.NgramTokenizer(4)
    ),
    mtime=fields.NUMERIC(
        stored=True
    )
    #slug=fields.NGRAMWORDS(
        #tokenizer=analysis.NgramTokenizer(4)
    #),
    #reactions=fields.NGRAMWORDS(
        #tokenizer=analysis.NgramTokenizer(4)
    #),
    #tags=fields.TEXT(
        #stored=False,
        #analyzer=analysis.KeywordAnalyzer(
            #lowercase=True,
            #commas=True
        #),
    #),
)

config = configparser.ConfigParser(
    interpolation=configparser.ExtendedInterpolation(),
    allow_no_value=True
)
config.read('config.ini')
config = __expandconfig(config)
import jinja2

class CMDLine(object):
    def __init__(self, executable):

@@ -138,7 +17,6 @@ class CMDLine(object):

            raise OSError('No %s found in PATH!' % executable)
        return


    @staticmethod
    def _which(name):
        for d in os.environ['PATH'].split(':'):

@@ -148,33 +26,6 @@ class CMDLine(object):

        return None


    def __enter__(self):
        self.process = subprocess.Popen(
            [self.executable, "-stay_open", "True", "-@", "-"],
            universal_newlines=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.process.stdin.write("-stay_open\nFalse\n")
        self.process.stdin.flush()


    def execute(self, *args):
        args = args + ("-execute\n",)
        self.process.stdin.write(str.join("\n", args))
        self.process.stdin.flush()
        output = ""
        fd = self.process.stdout.fileno()
        while not output.endswith(self.sentinel):
            output += os.read(fd, 4096).decode('utf-8', errors='ignore')
        return output[:-len(self.sentinel)]


class Pandoc(CMDLine):
    """ Pandoc command line call with piped in- and output """


@@ -254,23 +105,68 @@ class Pandoc(CMDLine):

        return stdout.decode('utf-8').strip()


class HeadlessChromium(CMDLine):
    def __init__(self, url):
        super().__init__('chromium-browser')
        self.url = url
class ExifTool(CMDLine):
    def __init__(self, fpath):
        self.fpath = fpath
        super().__init__('exiftool')

    def get(self):
    @staticmethod
    def exifdate2iso(value):
        """ converts and EXIF date string to ISO 8601 format

        :param value: EXIF date (2016:05:01 00:08:24)
        :type arg1: str
        :return: ISO 8601 string with UTC timezone 2016-05-01T00:08:24+0000
        :rtype: str
        """
        if not isinstance(value, str):
            return value
        match = REGEX['exifdate'].match(value)
        if not match:
            return value
        return "%s-%s-%sT%s+0000" % (
            match.group('year'),
            match.group('month'),
            match.group('day'),
            match.group('time')
        )

    def read(self):
        cmd = (
            self.executable,
            '--headless',
            '--disable-gpu',
            '--disable-preconnect',
            '--dump-dom',
            '--timeout 60',
            '--save-page-as-mhtml',
            "%s" % self.url
            '-sort',
            '-json',
            '-MIMEType',
            '-FileType',
            '-FileName',
            '-ModifyDate',
            '-CreateDate',
            '-DateTimeOriginal',
            '-ImageHeight',
            '-ImageWidth',
            '-Aperture',
            '-FOV',
            '-ISO',
            '-FocalLength',
            '-FNumber',
            '-FocalLengthIn35mmFormat',
            '-ExposureTime',
            '-Copyright',
            '-Artist',
            '-Model',
            '-GPSLongitude#',
            '-GPSLatitude#',
            '-LensID',
            '-LensSpec',
            '-Lens',
            '-ReleaseDate',
            '-Description',
            '-Headline',
            '-HierarchicalSubject',
            self.fpath
        )
        logging.debug('getting URL %s with headless chrome', self.url)

        logging.debug('reading EXIF from %s', self.fpath)
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,

@@ -280,113 +176,111 @@ class HeadlessChromium(CMDLine):


        stdout, stderr = p.communicate()
        if stderr:
            logging.error(
                "Error getting URL:\n\t%s\n\t%s",
                cmd,
                stderr
            )
        return stdout.decode('utf-8').strip()
            logging.error("Error reading EXIF:\n\t%s\n\t%s", cmd, stderr)

        exif = json.loads(stdout.decode('utf-8').strip()).pop()
        if 'ReleaseDate' in exif and 'ReleaseTime' in exif:
            exif['DateTimeRelease'] = "%s %s" % (exif.get('ReleaseDate'), exif.get('ReleaseTime')[:8])
            del(exif['ReleaseDate'])
            del(exif['ReleaseTime'])

        for k, v in exif.items():
            exif[k] = self.exifdate2iso(v)

        return exif

def __expandconfig():
    c = configparser.ConfigParser(
        interpolation=configparser.ExtendedInterpolation(),
        allow_no_value=True
    )
    c.read('config.ini')

    for s in c.sections():
        for o in c.options(s):
            curr = c.get(s, o)
            if 'photo' == s and 'regex' == o:
                REGEX.update({'photo': re.compile(curr)})
            c.set(s, o, os.path.expanduser(curr))

def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
    """ Used to create short, lowercase slug for a number (an epoch) passed """
    num = int(num)
    return ((num == 0) and numerals[0]) or (
        baseN(
            num // b,
            b,
            numerals
        ).lstrip(numerals[0]) + numerals[num % b]
    )


class wget(CMDLine):
    def __init__(self, url, dirname=None):
        super().__init__('wget')
        self.url = url
        self.slug = dirname or slugfname(self.url)
        self.saveto = os.path.join(
            config.get('source', 'offlinecopiesdir'),
            self.slug
        )
def slugfname(url):
    return "%s" % slugify(
        re.sub(r"^https?://(?:www)?", "", url),
        only_ascii=True,
        lower=True
    )[:200]

    def archive(self):
        cmd = (
            self.executable,
            '-e',
            'robots=off',
            '--timeout=360',
            '--no-clobber',
            '--no-directories',
            '--adjust-extension',
            '--span-hosts',
            '--wait=1',
            '--random-wait',
            '--convert-links',
            #'--backup-converted',
            '--page-requisites',
            '--directory-prefix=%s' % self.saveto,
            "%s" % self.url
        )
        logging.debug('getting URL %s with wget', self.url)
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
def __setup_sitevars():
    SiteVars = {}
    section = 'site'
    for o in config.options(section):
        SiteVars.update({o: config.get(section, o)})

        stdout, stderr = p.communicate()
        if stderr:
            logging.error(
                "Error getting URL:\n\t%s\n\t%s",
                cmd,
                stderr
            )
        return stdout.decode('utf-8').strip()
    # add site author
    section = 'author'
    SiteVars.update({section: {}})
    for o in config.options(section):
        SiteVars[section].update({o: config.get(section, o)})

def find_realurl(url):
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })
    # add extra sections to author
    for sub in config.get('author', 'appendwith').split():
        SiteVars[section].update({sub: {}})
        for o in config.options(sub):
            SiteVars[section][sub].update({o: config.get(sub, o)})

    try:
        r = requests.get(
            url,
            allow_redirects=True,
            timeout=60,
            headers=headers
        )
    except Exception as e:
        logging.error('getting real url failed: %s', e)
        return (None, 400)
    # push the whole thing into cache
    return SiteVars

    finalurl = list(urlparse(r.url))
    finalurl[4] = '&'.join(
        [x for x in finalurl[4].split('&') if not x.startswith('utm_')])
    finalurl = urlunparse(finalurl)

    return (finalurl, r.status_code)
ARROWFORMAT = {
    'iso': 'YYYY-MM-DDTHH:mm:ssZ',
    'display': 'YYYY-MM-DD HH:mm'
}

def find_archiveorgurl(url):
    url, status = find_realurl(url)
    if status == requests.codes.ok:
        return url
LLEVEL = {
    'critical': 50,
    'error': 40,
    'warning': 30,
    'info': 20,
    'debug': 10
}

    try:
        a = requests.get(
            "http://archive.org/wayback/available?url=%s" % url,
        )
    except Exception as e:
        logging.error('Failed to fetch archive.org availability for %s' % url)
        return None
REGEX = {
    'exifdate': re.compile(
        r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
        r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
    ),
    'cleanurl': re.compile(r"^https?://(?:www)?"),
    'urls': re.compile(
        r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
        r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
    ),
    'mdimg': re.compile(
        r'(?P<shortcode>\!\[(?P<alt>[^\]]+)\]\((?P<fname>[^\s]+)'
        r'(?:\s[\'\"](?P<title>[^\"\']+)[\'\"])?\)(?:\{(?P<css>[^\}]+)\})?)',
        re.IGNORECASE
    )
}

    if not a:
        logging.error('empty archive.org availability for %s' % url)
        return None
config = __expandconfig()

    try:
        a = json.loads(a.text)
        aurl = a.get(
            'archived_snapshots', {}
        ).get(
            'closest', {}
        ).get(
            'url', None
        )
        if aurl:
            logging.debug("found %s in archive.org for %s", aurl, url)
            return aurl
    except Exception as e:
        logging.error("archive.org parsing failed: %s", e)
j2 = jinja2.Environment(
    loader=jinja2.FileSystemLoader(
        searchpath=config.get('dirs', 'tmpl')
    ),
    lstrip_blocks=True
)

    return None
site = __setup_sitevars()