# nasg/shared.py
import configparser
import os
import re
import glob
import logging
import subprocess
import json
import requests
from urllib.parse import urlparse, urlunparse

from whoosh import fields
from whoosh import analysis
from slugify import slugify


LLEVEL = {
'critical': 50,
'error': 40,
'warning': 30,
'info': 20,
'debug': 10
}


def __expandconfig(config):
    """ Expand the configured directory names to full paths and add them
    back to the config as '<option>dir' entries """
    basepath = os.path.expanduser(config.get('common', 'base'))
config.set('common', 'basedir', basepath)
for section in ['source', 'target']:
for option in config.options(section):
opt = config.get(section, option)
config.set(section, "%sdir" % option, os.path.join(basepath,opt))
config.set('target', 'filesdir', os.path.join(
config.get('target', 'builddir'),
config.get('source', 'files'),
))
config.set('target', 'commentsdir', os.path.join(
config.get('target', 'builddir'),
config.get('site', 'commentspath'),
))
return config
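
# A minimal sketch of the config.ini layout __expandconfig expects; the
# values below are invented examples, not the original configuration:
#
#   [common]
#   base = ~/nasg
#
#   [source]
#   files = files
#   offlinecopies = offlinecopies
#
#   [target]
#   build = build
#
#   [site]
#   commentspath = comments
#
# after expansion, config.get('source', 'filesdir') and friends return
# absolute paths under the base directory.
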
def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
    """ Encode a number (typically an epoch timestamp) as a short,
    lowercase base-36 string """
num = int(num)
return ((num == 0) and numerals[0]) or (
baseN(
num // b,
b,
numerals
).lstrip(numerals[0]) + numerals[num % b]
)
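
# A few sanity checks for baseN (hedged, but easy to verify by hand):
#
#   >>> baseN(0)
#   '0'
#   >>> baseN(35)
#   'z'
#   >>> baseN(36)
#   '10'
#
# so a ten-digit epoch collapses into a six-character slug.
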
def slugfname(url):
    """ Make a short slug suitable for a filename out of a URL, dropping
    the scheme and any leading www. """
    return slugify(
        re.sub(r"^https?://(?:www\.)?", "", url),
        only_ascii=True,
        lower=True
    )[:200]
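
# Hedged example with a made-up URL; the exact output depends on the
# slugify package in use:
#
#   >>> slugfname('https://www.example.com/some/path')
#   'example-com-some-path'
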
ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
STRFISO = '%Y-%m-%dT%H:%M:%S%z'
# bare http(s) URLs preceded by whitespace in running text
URLREGEX = re.compile(
    r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
    r'\.[a-zA-Z0-9\.\/\?\:@\-_=#]*'
)
# EXIF-style timestamps: 'YYYY:MM:DD HH:MM:SS'
EXIFREXEG = re.compile(
    r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
    r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
)
# markdown images that point at local /files or /cache paths, with
# optional title and {attribute} block
MDIMGREGEX = re.compile(
    r'(!\[(.*)\]\((?:\/(?:files|cache)'
    r'(?:\/[0-9]{4}\/[0-9]{2})?\/(.*\.(?:jpe?g|png|gif)))'
    r'(?:\s+[\'\"]?(.*?)[\'\"]?)?\)(?:\{(.*?)\})?)',
    re.IGNORECASE
)
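
# A hedged illustration of the markdown image syntax MDIMGREGEX targets;
# the filename is invented:
#
#   ![alt text](/files/2017/06/photo.jpg "a title"){.someclass}
#
# group 2 captures the alt text, group 3 the filename, group 4 the title
# and group 5 the attribute block.
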
schema = fields.Schema(
url=fields.ID(
stored=True,
unique=True
),
category=fields.TEXT(
stored=True,
),
date=fields.DATETIME(
stored=True,
sortable=True
),
title=fields.TEXT(
stored=True,
analyzer=analysis.FancyAnalyzer()
),
weight=fields.NUMERIC(
sortable=True
),
img=fields.TEXT(
stored=True
),
content=fields.TEXT(
stored=True,
analyzer=analysis.FancyAnalyzer()
),
fuzzy=fields.NGRAMWORDS(
tokenizer=analysis.NgramTokenizer(4)
),
mtime=fields.NUMERIC(
stored=True
)
#slug=fields.NGRAMWORDS(
#tokenizer=analysis.NgramTokenizer(4)
#),
#reactions=fields.NGRAMWORDS(
#tokenizer=analysis.NgramTokenizer(4)
#),
#tags=fields.TEXT(
#stored=False,
#analyzer=analysis.KeywordAnalyzer(
#lowercase=True,
#commas=True
#),
#),
)
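
# A hedged sketch of feeding this schema to whoosh; the index directory
# name is an assumption, not taken from this module:
#
#   from whoosh import index
#   ix = index.create_in('searchdb', schema)
#   with ix.writer() as w:
#       w.add_document(url='https://example.com/entry', content='...')
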
config = configparser.ConfigParser(
interpolation=configparser.ExtendedInterpolation(),
allow_no_value=True
)
config.read('config.ini')
config = __expandconfig(config)


class CMDLine(object):
    """ Wrapper for piped command line calls """
    # exiftool prints this marker after each -execute in -stay_open mode;
    # execute() reads stdout until it appears (assumes exiftool's protocol)
    sentinel = "{ready}\n"

    def __init__(self, executable):
        self.executable = self._which(executable)
        if self.executable is None:
            raise OSError('No %s found in PATH!' % executable)

    @staticmethod
    def _which(name):
        """ Return the first match for name in PATH, or None """
        for d in os.environ['PATH'].split(':'):
            which = glob.glob(os.path.join(d, name), recursive=True)
            if which:
                return which.pop()
        return None

    def __enter__(self):
        # spawn the process in exiftool-style batch mode and keep it open
        self.process = subprocess.Popen(
            [self.executable, "-stay_open", "True", "-@", "-"],
            universal_newlines=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.process.stdin.write("-stay_open\nFalse\n")
        self.process.stdin.flush()

    def execute(self, *args):
        args = args + ("-execute\n",)
        self.process.stdin.write("\n".join(args))
        self.process.stdin.flush()
        output = ""
        fd = self.process.stdout.fileno()
        # read raw chunks until the sentinel marker terminates the response
        while not output.endswith(self.sentinel):
            output += os.read(fd, 4096).decode('utf-8', errors='ignore')
        return output[:-len(self.sentinel)]
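
# The context-manager half of CMDLine implements exiftool's -stay_open
# batch protocol, so a hedged usage sketch (placeholder path) would be:
#
#   with CMDLine('exiftool') as exif:
#       raw = exif.execute('-json', '/path/to/photo.jpg')
#
# Pandoc, HeadlessChromium and wget below skip __enter__ entirely and
# spawn one process per call instead.
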
class Pandoc(CMDLine):
""" Pandoc command line call with piped in- and output """

    def __init__(self, md2html=True):
        super().__init__('pandoc')
        # both the html5 and the plain output use the same markdown dialect
        md = "markdown+" + "+".join([
            'backtick_code_blocks',
            'auto_identifiers',
            'fenced_code_attributes',
            'definition_lists',
            'grid_tables',
            'pipe_tables',
            'strikeout',
            'superscript',
            'subscript',
            'markdown_in_html_blocks',
            'shortcut_reference_links',
            'autolink_bare_uris',
            'raw_html',
            'link_attributes',
            'header_attributes',
            'footnotes',
        ])
        if 'plain' == md2html:
            self.i = md
            self.o = 'plain'
        elif md2html:
            self.i = md
            self.o = 'html5'
        else:
            self.i = 'html'
            self.o = "markdown-" + "-".join([
                'raw_html',
                'native_divs',
                'native_spans',
            ])

def convert(self, text):
cmd = (
self.executable,
'-o-',
'--from=%s' % self.i,
'--to=%s' % self.o
)
logging.debug('converting string with Pandoc')
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = p.communicate(input=text.encode())
if stderr:
logging.error(
"Error during pandoc covert:\n\t%s\n\t%s",
cmd,
stderr
)
return stdout.decode('utf-8').strip()
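
# Hedged example of the default markdown -> html5 direction; exact output
# can differ between pandoc versions:
#
#   html = Pandoc().convert('# Hello *world*')
#   # roughly '<h1 id="hello-world">Hello <em>world</em></h1>'
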
class HeadlessChromium(CMDLine):
    """ Fetch the rendered DOM of a URL with headless Chromium """

    def __init__(self, url):
        super().__init__('chromium-browser')
        self.url = url

    def get(self):
cmd = (
self.executable,
'--headless',
'--disable-gpu',
'--disable-preconnect',
'--dump-dom',
            '--timeout=60',
'--save-page-as-mhtml',
"%s" % self.url
)
logging.debug('getting URL %s with headless chrome', self.url)
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = p.communicate()
if stderr:
logging.error(
"Error getting URL:\n\t%s\n\t%s",
cmd,
stderr
)
return stdout.decode('utf-8').strip()
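
# Hedged usage sketch (placeholder URL): returns whatever --dump-dom
# printed, i.e. the rendered document as an HTML string:
#
#   dom = HeadlessChromium('https://example.com/').get()
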
class wget(CMDLine):
    """ Save an offline copy of a URL and its page requisites with wget """

    def __init__(self, url, dirname=None):
        super().__init__('wget')
        self.url = url
        self.slug = dirname or slugfname(self.url)
        self.saveto = os.path.join(
            config.get('source', 'offlinecopiesdir'),
            self.slug
        )

    def archive(self):
cmd = (
self.executable,
'-e',
'robots=off',
'--timeout=360',
'--no-clobber',
'--no-directories',
'--adjust-extension',
'--span-hosts',
'--wait=1',
'--random-wait',
'--convert-links',
#'--backup-converted',
'--page-requisites',
'--directory-prefix=%s' % self.saveto,
"%s" % self.url
)
logging.debug('getting URL %s with wget', self.url)
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
stdout, stderr = p.communicate()
if stderr:
logging.error(
"Error getting URL:\n\t%s\n\t%s",
cmd,
stderr
)
return stdout.decode('utf-8').strip()
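
# Hedged usage sketch (placeholder URL); the copy lands under
# <offlinecopiesdir>/<slug>/ as set up in __init__:
#
#   wget('https://example.com/article').archive()
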
def find_realurl(url):
    """ Follow redirects to the final URL and strip utm_* tracking
    parameters; returns a (url, status code) tuple """
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })
try:
r = requests.get(
url,
allow_redirects=True,
timeout=60,
headers=headers
)
except Exception as e:
logging.error('getting real url failed: %s', e)
return (None, 400)
    finalurl = list(urlparse(r.url))
    # index 4 of the parsed URL is the query string; drop utm_* params
    finalurl[4] = '&'.join(
        [x for x in finalurl[4].split('&') if not x.startswith('utm_')])
    finalurl = urlunparse(finalurl)
return (finalurl, r.status_code)
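
# Hedged example with a made-up URL, assuming no redirects:
#
#   find_realurl('https://example.com/post?utm_source=feed&id=1')
#   # -> ('https://example.com/post?id=1', 200)
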
def find_archiveorgurl(url):
    """ Return the resolved live URL if it responds with HTTP 200,
    otherwise the closest archive.org snapshot, or None """
    url, status = find_realurl(url)
    if status == requests.codes.ok:
        return url
try:
a = requests.get(
"http://archive.org/wayback/available?url=%s" % url,
)
    except Exception as e:
        logging.error(
            'failed to fetch archive.org availability for %s: %s',
            url,
            e
        )
        return None
    # requests.Response is falsy on 4xx/5xx status codes
    if not a:
        logging.error('empty archive.org availability response for %s', url)
        return None
try:
a = json.loads(a.text)
aurl = a.get(
'archived_snapshots', {}
).get(
'closest', {}
).get(
'url', None
)
if aurl:
logging.debug("found %s in archive.org for %s", aurl, url)
return aurl
except Exception as e:
logging.error("archive.org parsing failed: %s", e)
return None
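
# Hedged usage sketch: prefer the live page, fall back to the closest
# Wayback Machine snapshot:
#
#   url = find_archiveorgurl('https://example.com/gone-away')
#   # -> the live URL, an archive.org snapshot URL, or None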