This commit is contained in:
Peter Molnar 2017-06-28 11:20:26 +00:00
parent 7c0daa0904
commit 70bd917de4
4 changed files with 386 additions and 67 deletions

nasg.py

@@ -16,6 +16,9 @@ import math
import asyncio
import csv
import getpass
import quopri
import base64
import mimetypes
import magic
import arrow
@@ -33,6 +36,7 @@ from webmentiontools.send import WebmentionSend
from bleach import clean
from emoji import UNICODE_EMOJI
from bs4 import BeautifulSoup
from readability.readability import Document
import shared
def splitpath(path):
@@ -89,7 +93,8 @@ class BaseRenderable(object):
return
def writerendered(self, content):
def writerendered(self, content, mtime=None):
mtime = mtime or self.mtime
d = os.path.dirname(self.target)
if not os.path.isdir(d):
os.mkdir(d)
@@ -98,7 +103,7 @@ class BaseRenderable(object):
logging.debug('writing %s', self.target)
html.write(content)
html.close()
os.utime(self.target, (self.mtime, self.mtime))
os.utime(self.target, (mtime, mtime))
class Indexer(object):
@@ -197,14 +202,25 @@ class Indexer(object):
self.writer.commit()
class OfflineCopy(object):
def __init__(self, url):
class OfflineArchive(object):
# keep in mind that these are frontmattered files holding the page's full HTML and embedded images
# they can get VERY large
def __init__(self, url, content=None, decode_email=False):
self.url = url
self.fname = "%s.md" % slugify(re.sub(r"^https?://", "", url))[:200]
self.parsed = urllib.parse.urlparse(url)
self.fbase = shared.slugfname(url)
self.fname = "%s.md" % self.fbase
self.target = os.path.join(
shared.config.get('source', 'offlinecopiesdir'),
self.fname
)
self.targetd = os.path.join(
shared.config.get('source', 'offlinecopiesdir'),
self.fbase
)
if not os.path.isdir(self.targetd):
os.mkdir(self.targetd)
self.fm = frontmatter.loads('')
self.fm.metadata = {
'url': self.url,
@@ -215,36 +231,152 @@ class OfflineCopy(object):
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
def __repr__(self):
return self.fm.content
self.skip_fetch = False
if content:
self.skip_fetch = True
if decode_email:
content = quopri.decodestring(content)
content = str(content, 'utf-8', errors='replace')
self.fm.content = content
#self.tmp = tempfile.mkdtemp(
#'offlinearchive_',
#dir=tempfile.gettempdir()
#)
#atexit.register(
#shutil.rmtree,
#os.path.abspath(self.tmp)
#)
#self.images = []
def write(self):
self.exists = os.path.isfile(self.target)
def _getimage(self, src):
imgname, imgext = os.path.splitext(os.path.basename(src))
imgtarget = os.path.join(
self.targetd,
"%s%s" % (slugify(imgname, only_ascii=True, lower=True), imgext)
)
try:
logging.debug('downloading image %s', src)
r = requests.get(
src,
allow_redirects=True,
timeout=60,
stream=True
)
with open(imgtarget, 'wb') as f:
for chunk in r.iter_content():
if chunk:
f.write(chunk)
self.fm.content = self.fm.content.replace(
src,
# use the same slugified name and extension as imgtarget above,
# so the rewritten src actually points at the saved file
'%s/%s%s' % (self.fbase, slugify(imgname, only_ascii=True, lower=True), imgext)
)
except Exception as e:
logging.error('pulling image %s failed: %s', src, e)
return
def _get_images(self):
logging.debug("trying to save images")
soup = BeautifulSoup(self.fm.content, 'lxml')
embedded = re.compile(r'^data:.*')
for img in soup.find_all('img'):
src = img.get('src')
if not src:
continue
if embedded.match(src):
continue
im = urllib.parse.urlparse(src)
if not im.scheme:
im = im._replace(scheme=self.parsed.scheme)
if not im.netloc:
im = im._replace(netloc=self.parsed.netloc)
self._getimage(im.geturl())
#def _getimage(self, src):
#tmp = os.path.join(self.tmp, "%s" % slugify(os.path.basename(src))[:200])
#try:
#r = requests.get(
#src,
#allow_redirects=True,
#timeout=60,
#stream=True
#)
#with open(tmp, 'wb') as f:
#for chunk in r.iter_content():
#if chunk:
#f.write(chunk)
#logging.debug('trying to embed %s', src)
#with open(tmp, 'rb') as imgdata:
#data = str(base64.b64encode(imgdata.read()), 'ascii')
#mimetype, encoding = mimetypes.guess_type(tmp)
#self.fm.content = self.fm.content.replace(
#src,
#"data:%s;base64,%s" % (mimetype, data)
#)
#except Exception as e:
#logging.error('pulling image %s failed: %s', src, e)
#return
#def _embed_images(self):
#logging.debug("trying to embed images")
#soup = BeautifulSoup(self.fm.content, 'lxml')
#embedded = re.compile(r'^data:.*')
#for img in soup.find_all('img'):
#src = img.get('src')
#if not src:
#continue
#if embedded.match(src):
#continue
#im = urllib.parse.urlparse(src)
#if not im.scheme:
#im = im._replace(scheme=self.parsed.scheme)
#if not im.netloc:
#im = im._replace(netloc=self.parsed.netloc)
#self._getimage(im.geturl())
def save(self):
logging.info(
"savig offline copy of\n\t%s to:\n\t%s",
self.url,
self.target
)
with open(self.target, 'wt') as f:
f.write(frontmatter.dumps(self.fm))
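For orientation, a minimal sketch of reading one of these archives back with the same python-frontmatter API that save() uses (the path is hypothetical):

import frontmatter

post = frontmatter.load('offline-copies/example-com-some-post.md')
post.metadata['url']  # the original URL recorded in __init__
post.content          # the full HTML snapshot; can get very large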
@property
def archiveorgurl(self):
logging.debug("trying archive.org for %s", self.url)
a = self.fetch(
"http://archive.org/wayback/available?url=%s" % self.url,
)
if not a:
logging.debug("no entry for %s on archive.org", self.url)
return None
try:
a = json.loads(a.text)
return a.get(
aurl = a.get(
'archived_snapshots', {}
).get(
'closest', {}
).get(
'url', None
)
logging.debug("found %s in archive.org for %s", aurl, self.url)
self.updateurl(aurl)
return self.fetch(aurl)
except Exception as e:
logging.error("archive.org parsing failed: %s", e)
return None
@@ -264,24 +396,40 @@ class OfflineCopy(object):
return None
def run(self):
def read():
if os.path.isfile(self.target):
with open(self.target) as f:
self.fm = frontmatter.loads(f.read())
return
logging.info("prepairing offline copy of %s", self.url)
r = self.fetch(self.url)
if not r:
r = self.fetch(self.archiveorgurl)
if r:
if r.url != self.url:
self.fm.metadata['realurl'] = r.url
def run(self):
if self.exists:
logging.info("offline archive for %s already exists", self.url)
return
logging.info("prepairing offline copy of %s", self.url)
if not self.skip_fetch:
r = self.fetch(self.url)
# if the live fetch failed, fall back to an archive.org snapshot:
if not r:
logging.warning("couldn't get live version of %s, trying archive.org", self.url)
r = self.fetch(self.archiveorgurl)
# no live and no archive.org entry :((
# however, by some miracle, I may already have a copy, so skip if it's there already
if not r:
logging.error("no live or archive version of %s found :((", self.url)
if not self.exists:
self.save()
return
self.fm.content = r.text
self.write()
return
self._get_images()
self.save()
class Renderer(object):
@@ -302,9 +450,10 @@ class Renderer(object):
@staticmethod
def jinja_filter_date(d, form='%Y-%m-%d %H:%m:%S'):
if d == 'now':
return arrow.now().strftime(form)
d = arrow.now().datetime
if form == 'c':
form = '%Y-%m-%dT%H:%M:%S%z'
return d.isoformat()
#form = '%Y-%m-%dT%H:%M:%S%z'
return d.strftime(form)
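For illustration, what the revised filter now returns (standard library only; the date values are made up):

from datetime import datetime, timezone

d = datetime(2017, 6, 28, 11, 20, 26, tzinfo=timezone.utc)
d.isoformat()           # the 'c' branch: '2017-06-28T11:20:26+00:00'
d.strftime('%Y-%m-%d')  # any other strftime pattern is applied as before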
@@ -422,7 +571,7 @@ class Comment(BaseRenderable):
'content': self.content,
'html': self.html,
'source': self.source,
'target': self.target,
'target': self.targeturl,
'type': self.meta.get('type', 'webmention'),
'reacji': self.reacji,
'fname': self.fname
@@ -456,34 +605,43 @@ class Comment(BaseRenderable):
return self._source
@property
def targeturl(self):
if hasattr(self, '_targeturl'):
return self._targeturl
t = self.meta.get('target', shared.config.get('site', 'url'))
self._targeturl = '{p.path}'.format(p=urllib.parse.urlparse(t)).strip('/')
return self._targeturl
@property
def target(self):
if hasattr(self, '_target'):
return self._target
t = self.meta.get('target', shared.config.get('site', 'url'))
self._target = '{p.path}'.format(p=urllib.parse.urlparse(t)).strip('/')
return self._target
async def render(self, renderer):
logging.info("rendering and saving comment %s", self.fname)
targetdir = os.path.abspath(os.path.join(
shared.config.get('target', 'builddir'),
shared.config.get('site', 'commentspath'),
self.fname
))
target = os.path.join(targetdir, 'index.html')
if not shared.config.getboolean('params', 'force') and os.path.isfile(target):
ttime = int(os.path.getmtime(target))
self._target = os.path.join(targetdir, 'index.html')
return self._target
async def render(self, renderer):
logging.info("rendering and saving comment %s", self.fname)
if not shared.config.getboolean('params', 'force') and os.path.isfile(self.target):
ttime = int(os.path.getmtime(self.target))
logging.debug('ttime is %d mtime is %d', ttime, self.mtime)
if ttime == self.mtime:
logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime)
logging.debug(
'%s exists and up-to-date (lastmod: %d)',
self.target,
ttime
)
return
#if not os.path.isdir(targetdir):
#os.mkdir(targetdir)
tmplvars = {
'reply': self.tmplvars,
'site': renderer.sitevars,
@@ -719,7 +877,8 @@ class WebImage(object):
self._rssenclosure = {
'mime': magic.Magic(mime=True).from_file(target['fpath']),
'url': target['url'],
'size': os.path.getsize(target['fpath'])
'size': os.path.getsize(target['fpath']),
'fname': self.fname
}
return self._rssenclosure
@@ -976,8 +1135,8 @@ class Taxonomy(BaseIter):
async def render(self, renderer):
if not self.slug or self.slug is 'None':
return
#if not self.slug or self.slug is 'None':
#return
self.__mkdirs()
page = 1
@@ -1031,24 +1190,20 @@ class Taxonomy(BaseIter):
os.utime(target, (self.mtime, self.mtime))
if 1 == page:
target = os.path.join(self.feedp, 'index.rss')
logging.info("rendering RSS feed to %s", target)
r = renderer.j2.get_template('rss.html').render(tmplvars)
#target = os.path.join(self.feedp, 'index.rss')
#logging.info("rendering RSS feed to %s", target)
#r = renderer.j2.get_template('rss.html').render(tmplvars)
#with open(target, "wt") as html:
#html.write(r)
#os.utime(target, (self.mtime, self.mtime))
target = os.path.join(self.feedp, 'index.atom')
logging.info("rendering Atom feed to %s", target)
r = renderer.j2.get_template('atom.html').render(tmplvars)
with open(target, "wt") as html:
html.write(r)
os.utime(target, (self.mtime, self.mtime))
if not self.taxonomy or self.taxonomy == 'category':
t = shared.config.get('site', 'websuburl')
data = {
'hub.mode': 'publish',
'hub.url': "%s%s" % (
shared.config.get('site', 'url'), self.baseurl
)
}
logging.info("pinging %s with data %s", t, data)
requests.post(t, data=data)
# ---
# this is a joke
# see http://indieweb.org/YAMLFeed
@@ -1081,6 +1236,18 @@ class Taxonomy(BaseIter):
os.utime(target, (self.mtime, self.mtime))
# ---
if 1 == page:
if not self.taxonomy or self.taxonomy == 'category':
t = shared.config.get('site', 'websuburl')
data = {
'hub.mode': 'publish',
'hub.url': "%s%s" % (
shared.config.get('site', 'url'), self.baseurl
)
}
logging.info("pinging %s with data %s", t, data)
requests.post(t, data=data)
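The two form fields follow the WebSub (formerly PubSubHubbub) publish convention; a standalone equivalent of this ping, with the hub and feed URLs as placeholders:

import requests

requests.post(
    'https://hub.example.com/',  # whatever shared.config 'websuburl' points to
    data={
        'hub.mode': 'publish',
        'hub.url': 'https://example.com/category/feed/',  # site url + baseurl
    }
)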
class Content(BaseIter):
def __init__(self, images, comments, extensions=['md']):
@@ -1557,7 +1724,7 @@ class Singular(BaseRenderable):
if not isinstance(maybe, list):
maybe = [maybe]
for url in maybe:
copies[url] = OfflineCopy(url)
copies[url] = OfflineArchive(url)
copies[url].run()
self.copies = copies
@@ -1601,7 +1768,8 @@ class Singular(BaseRenderable):
'slug': self.fname,
'shortslug': self.shortslug,
'rssenclosure': self.rssenclosure,
'copies': self.offlinecopies,
#'copies': self.offlinecopies,
'copies': [],
'comments': self.comments,
'replies': self.replies,
'reacjis': self.reacjis,
@@ -1617,6 +1785,15 @@ class Singular(BaseRenderable):
return self._shortslug
@property
def target(self):
targetdir = os.path.abspath(os.path.join(
shared.config.get('target', 'builddir'),
self.fname
))
return os.path.join(targetdir, 'index.html')
async def rendercomments(self, renderer):
for comment in self.comments:
await comment.render(renderer)
@@ -1638,17 +1815,15 @@ class Singular(BaseRenderable):
mtime = lctime
logging.info("rendering and saving %s", self.fname)
targetdir = os.path.abspath(os.path.join(
shared.config.get('target', 'builddir'),
self.fname
))
target = os.path.join(targetdir, 'index.html')
if not shared.config.getboolean('params', 'force') and os.path.isfile(target):
ttime = int(os.path.getmtime(target))
if not shared.config.getboolean('params', 'force') and os.path.isfile(self.target):
ttime = int(os.path.getmtime(self.target))
logging.debug('ttime is %d mtime is %d', ttime, mtime)
if ttime == mtime:
logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime)
logging.debug(
'%s exists and up-to-date (lastmod: %d)',
self.target,
ttime
)
return
tmplvars = {
@@ -1657,7 +1832,7 @@ class Singular(BaseRenderable):
'taxonomy': {},
}
r = renderer.j2.get_template(self.tmplfile).render(tmplvars)
self.writerendered(target, r, mtime)
self.writerendered(r, mtime)
async def ping(self, pinger):
@@ -1746,6 +1921,12 @@ class NASG(object):
default=False,
help='skip rendering'
)
parser.add_argument(
'--refetch',
action='store_true',
default=False,
help='force re-fetching offline archives'
)
params = vars(parser.parse_args())
shared.config.add_section('params')
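Stepping back, a minimal sketch of how the reworked OfflineArchive is meant to be driven (the URL, identifier, and raw_body are hypothetical; run() short-circuits when the archive already exists, presumably to be overridden by the new --refetch flag):

archive = OfflineArchive('https://example.com/some/post')
archive.run()  # fetch live page, fall back to archive.org, localize images, save

# archiving an already-fetched, quoted-printable email body:
archive = OfflineArchive('message-id-slug', content=raw_body, decode_email=True)
archive.run()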

new.py Normal file → Executable file

@@ -119,7 +119,7 @@ if __name__ == '__main__':
doc.content = content
tmpsave = os.path.join(tempfile.gettempdir(), "%s.md" % slug)
saveto = input('Save to: [%s]: ' % categories) or tmpsave
saveto = input('Save to: [%s]: ' % categories) or 'bookmark'
if tmpsave != saveto:
saveto = os.path.join(shared.config.get('source', 'contentdir'), saveto, "%s.md" % slug)


@@ -14,6 +14,22 @@ from slugify import slugify
from pprint import pprint
""" TODO
- following from:
- tumblr
- deviantart
- flickr
- wordpress.com
- twitter
- 500px
"""
class Bookmark(object):
def __init__(self, title, url, fname=None):
self.fm = frontmatter.loads('')
@@ -126,6 +142,37 @@ class Fav(object):
os.utime(self.target, (self.arrow.timestamp, self.arrow.timestamp))
class PinterestFav(Fav):
def __init__(self, url):
super(PinterestFav, self).__init__()
self.url = url
self.fname = "pinterest-%s.md" % (list(filter(None, url.split('/')))[-1])
def run(self):
try:
r = requests.get(self.url)
soup = bs4.BeautifulSoup(r.text, 'lxml')
ld = json.loads(soup.find('script', type='application/ld+json').text)
imgurl = ld.get('image')
self.saveimg(imgurl)
self.fm.metadata = {
'published': arrow.get(
ld.get('datePublished', arrow.utcnow().timestamp)
).format(shared.ARROWISO),
'title': ld.get('headline', self.url),
'favorite-of': self.url,
'image': self.imgname
}
content = ld.get('articleBody', '')
content = shared.Pandoc(False).convert(content)
self.fm.content = content
except Exception as e:
logging.error('saving pinterest fav %s failed: %s', self.url, e)
return
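A sketch of the intended call pattern, mirroring the other Fav subclasses (the pin URL is made up, and write() is assumed to come from the Fav base class, as the FivehpxFavs loop below suggests):

fav = PinterestFav('https://www.pinterest.com/pin/1234567890/')
fav.run()    # scrape the ld+json block, download the image, fill the frontmatter
fav.write()  # persist the .md file, as the other favs do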
class FlickrFav(Fav):
def __init__(self, photo):
super(FlickrFav, self).__init__()
@@ -280,6 +327,31 @@ class FivehpxFavs(Favs):
fav.write()
#class Following(object):
#def __init__(self, confgroup):
#self.confgroup = confgroup
#self.url = shared.config.get(confgroup, 'fav_api')
#class FlickrFollowing(Following):
#def __init__(self):
#super(FlickrFollowing, self).__init__('flickr')
#self.params = {
#'method': 'flickr.contacts.getList',
#'api_key': shared.config.get('flickr', 'api_key'),
#'format': 'json',
#'nojsoncallback': '1',
#}
#def run(self):
#r = requests.get(self.url,params=self.params)
#js = json.loads(r.text)
#pprint(js)
#for contact in js.get('contacts', {}).get('contact', []):
#pprint(contact)
if __name__ == '__main__':
while len(logging.root.handlers) > 0:
logging.root.removeHandler(logging.root.handlers[-1])
@@ -297,3 +369,6 @@ if __name__ == '__main__':
fivehpx = FivehpxFavs()
fivehpx.run()
#flickrfollow = FlickrFollowing()
#flickrfollow.run()


@@ -4,9 +4,11 @@ import re
import glob
import logging
import subprocess
import json
from whoosh import fields
from whoosh import analysis
from slugify import slugify
def __expandconfig(config):
""" add the dirs to the config automatically """
@@ -38,6 +40,8 @@ def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
).lstrip(numerals[0]) + numerals[num % b]
)
def slugfname(url):
return "%s" % slugify(re.sub(r"^https?://(?:www)?", "", url))[:200]
ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
STRFISO = '%Y-%m-%dT%H:%M:%S%z'
@@ -104,6 +108,65 @@ config.read('config.ini')
config = __expandconfig(config)
class TokenDB(object):
def __init__(self):
self.db = os.path.abspath(os.path.join(
config.get('common', 'basedir'),
'tokens.json'
))
self.tokens = {}
self.refresh()
def refresh(self):
if os.path.isfile(self.db):
with open(self.db, 'rt') as f:
self.tokens = json.loads(f.read())
def save(self):
with open(self.db, 'wt') as f:
f.write(
json.dumps(
self.tokens, indent=4, sort_keys=True
)
)
self.refresh()
def get_token(self, token):
return self.tokens.get(token, None)
def get_service(self, service):
s = self.tokens.get(service, None)
if s:
s = self.get_token(s)
return s
def set_service(self, service, token):
self.tokens.update({
service: token
})
#self.save()
def set_token(self, token, secret):
self.tokens.update({
token: {
'oauth_token': token,
'oauth_token_secret': secret
}
})
#self.save()
def set_verifier(self, token, verifier):
t = self.tokens.get(token)
t.update({
'verifier': verifier
})
self.tokens.update({
token: t
})
#self.save()
tokendb = TokenDB()
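A minimal sketch of the OAuth 1.0a bookkeeping this enables (token strings are placeholders; note that the save() calls inside the setters are commented out in this commit, so persistence is manual for now):

db = TokenDB()
db.set_token('request-token', 'request-secret')
db.set_verifier('request-token', 'verifier-from-callback')
db.set_service('flickr', 'request-token')
db.get_service('flickr')  # -> {'oauth_token': ..., 'oauth_token_secret': ..., 'verifier': ...}
db.save()                 # writes tokens.json explicitly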
class CMDLine(object):
def __init__(self, executable):
self.executable = self._which(executable)