This commit is contained in:
Peter Molnar 2017-06-28 11:20:26 +00:00
parent 7c0daa0904
commit 70bd917de4
4 changed files with 386 additions and 67 deletions

303
nasg.py
View file

@ -16,6 +16,9 @@ import math
import asyncio import asyncio
import csv import csv
import getpass import getpass
import quopri
import base64
import mimetypes
import magic import magic
import arrow import arrow
@ -33,6 +36,7 @@ from webmentiontools.send import WebmentionSend
from bleach import clean from bleach import clean
from emoji import UNICODE_EMOJI from emoji import UNICODE_EMOJI
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from readability.readability import Document
import shared import shared
def splitpath(path): def splitpath(path):
@ -89,7 +93,8 @@ class BaseRenderable(object):
return return
def writerendered(self, content): def writerendered(self, content, mtime=None):
mtime = mtime or self.mtime
d = os.path.dirname(self.target) d = os.path.dirname(self.target)
if not os.path.isdir(d): if not os.path.isdir(d):
os.mkdir(d) os.mkdir(d)
@ -98,7 +103,7 @@ class BaseRenderable(object):
logging.debug('writing %s', self.target) logging.debug('writing %s', self.target)
html.write(content) html.write(content)
html.close() html.close()
os.utime(self.target, (self.mtime, self.mtime)) os.utime(self.target, (mtime, mtime))
class Indexer(object): class Indexer(object):
@ -197,14 +202,25 @@ class Indexer(object):
self.writer.commit() self.writer.commit()
class OfflineCopy(object): class OfflineArchive(object):
def __init__(self, url): # keep in mind that these are frontmattered HTML files with full HTML and embedded images
# they can get VERY large
def __init__(self, url, content=None, decode_email=False):
self.url = url self.url = url
self.fname = "%s.md" % slugify(re.sub(r"^https?://", "", url))[:200] self.parsed = urllib.parse.urlparse(url)
self.fbase = shared.slugfname(url)
self.fname = "%s.md" % self.fbase
self.target = os.path.join( self.target = os.path.join(
shared.config.get('source', 'offlinecopiesdir'), shared.config.get('source', 'offlinecopiesdir'),
self.fname self.fname
) )
self.targetd = os.path.join(
shared.config.get('source', 'offlinecopiesdir'),
self.fbase
)
if not os.path.isdir(self.targetd):
os.mkdir(self.targetd)
self.fm = frontmatter.loads('') self.fm = frontmatter.loads('')
self.fm.metadata = { self.fm.metadata = {
'url': self.url, 'url': self.url,
@ -215,36 +231,152 @@ class OfflineCopy(object):
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
}) })
def __repr__(self): self.skip_fetch = False
return self.fm.content if content:
self.skip_fetch = True
if decode_email:
content = quopri.decodestring(content)
content = str(content, 'utf-8', errors='replace')
self.fm.content = content
#self.tmp = tempfile.mkdtemp(
#'offlinearchive_',
#dir=tempfile.gettempdir()
#)
#atexit.register(
#shutil.rmtree,
#os.path.abspath(self.tmp)
#)
#self.images = []
def write(self): self.exists = os.path.isfile(self.target)
def _getimage(self, src):
    """Download one image next to the archive and relink the content to it.

    The image is stored in ``self.targetd`` under a slugified file name that
    keeps the original extension; every occurrence of ``src`` in
    ``self.fm.content`` is then replaced with ``<fbase>/<stored file name>``.

    Args:
        src: absolute URL of the image to download.

    Returns:
        None. Any failure (network, HTTP status, disk) is logged and
        swallowed so one broken image does not abort the whole archive.
    """
    # derive the name from the URL *path* only, so a query string such as
    # "?w=600" does not end up inside the file extension
    imgpath = urllib.parse.urlparse(src).path
    imgname, imgext = os.path.splitext(os.path.basename(imgpath))
    localname = "%s%s" % (
        slugify(imgname, only_ascii=True, lower=True),
        imgext
    )
    imgtarget = os.path.join(self.targetd, localname)

    try:
        logging.debug('donwloading image %s', src)
        r = requests.get(
            src,
            allow_redirects=True,
            timeout=60,
            stream=True
        )
        # do not store an HTML error page under an image file name
        r.raise_for_status()
        with open(imgtarget, 'wb') as f:
            # explicit chunk size: the default would iterate byte by byte
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

        # link to the file that was actually written (slug + extension)
        self.fm.content = self.fm.content.replace(
            src,
            '%s/%s' % (self.fbase, localname)
        )
    except Exception as e:
        logging.error('pulling image %s failed: %s', src, e)
    return
def _get_images(self):
    """Download every non-embedded image referenced by the archived HTML.

    Parses ``self.fm.content``, skips ``<img>`` tags without a ``src`` and
    those using ``data:`` URIs, and hands the absolute URL of each remaining
    image to ``self._getimage``.
    """
    logging.debug("trying to save images")
    soup = BeautifulSoup(self.fm.content, 'lxml')

    for img in soup.find_all('img'):
        src = img.get('src')
        # nothing to fetch for missing sources or already embedded data
        if not src or src.startswith('data:'):
            continue

        # Resolve against the page URL: this covers protocol-relative
        # ("//host/x.png"), root-relative ("/x.png") and page-relative
        # ("x.png") sources, while absolute URLs pass through unchanged.
        # NOTE(review): _getimage rewrites occurrences of the URL it is
        # given, so a relative src is downloaded but its attribute text in
        # the content is not rewritten - confirm whether that is intended.
        self._getimage(urllib.parse.urljoin(self.url, src))
#def _getimage(self, src):
#tmp = os.path.join(self.tmp, "%s" % slugify(os.path.basename(src))[:200])
#try:
#r = requests.get(
#src,
#allow_redirects=True,
#timeout=60,
#stream=True
#)
#with open(tmp, 'wb') as f:
#for chunk in r.iter_content():
#if chunk:
#f.write(chunk)
#logging.debug('trying to embed %s', src)
#with open(tmp, 'rb') as imgdata:
#data = str(base64.b64encode(imgdata.read()), 'ascii')
#mimetype, encoding = mimetypes.guess_type(tmp)
#self.fm.content = self.fm.content.replace(
#src,
#"data:%s;base64,%s" % (mimetype, data)
#)
#except Exception as e:
#logging.error('pulling image %s failed: %s', src, e)
#return
#def _embed_images(self):
#logging.debug("trying to embed images")
#soup = BeautifulSoup(self.fm.content, 'lxml')
#embedded = re.compile(r'^data:.*')
#for img in soup.find_all('img'):
#src = img.get('src')
#if not src:
#continue
#if embedded.match(src):
#continue
#im = urllib.parse.urlparse(src)
#if not im.scheme:
#im = im._replace(scheme=self.parsed.scheme)
#if not im.netloc:
#im = im._replace(netloc=self.parsed.netloc)
#self._getimage(im.geturl())
def save(self):
logging.info( logging.info(
"savig offline copy of\n\t%s to:\n\t%s", "savig offline copy of\n\t%s to:\n\t%s",
self.url, self.url,
self.target self.target
) )
with open(self.target, 'wt') as f: with open(self.target, 'wt') as f:
f.write(frontmatter.dumps(self.fm)) f.write(frontmatter.dumps(self.fm))
@property @property
def archiveorgurl(self): def archiveorgurl(self):
logging.debug("trying archive.org for %s", self.url)
a = self.fetch( a = self.fetch(
"http://archive.org/wayback/available?url=%s" % self.url, "http://archive.org/wayback/available?url=%s" % self.url,
) )
if not a: if not a:
logging.debug("no entry for %s on archive.org", self.url)
return None return None
try: try:
a = json.loads(a.text) a = json.loads(a.text)
return a.get( aurl = a.get(
'archived_snapshots', {} 'archived_snapshots', {}
).get( ).get(
'closest', {} 'closest', {}
).get( ).get(
'url', None 'url', None
) )
logging.debug("found %s in archive.org for %s", aurl, self.url)
self.updateurl(aurl)
return self.fetch(aurl)
except Exception as e: except Exception as e:
logging.error("archive.org parsing failed: %s", e) logging.error("archive.org parsing failed: %s", e)
return None return None
@ -264,24 +396,40 @@ class OfflineCopy(object):
return None return None
def run(self): def read():
if os.path.isfile(self.target): if os.path.isfile(self.target):
with open(self.target) as f: with open(self.target) as f:
self.fm = frontmatter.loads(f.read()) self.fm = frontmatter.loads(f.read())
return return
def run(self):
if self.exists:
logging.info("offline archive for %s already exists", self.url)
return
logging.info("prepairing offline copy of %s", self.url) logging.info("prepairing offline copy of %s", self.url)
if not self.skip_fetch:
r = self.fetch(self.url) r = self.fetch(self.url)
# in case it's not, try to look for an archive.org url:
if not r: if not r:
logging.warning("couldn't get live version of %s, trying archive.org", self.url)
r = self.fetch(self.archiveorgurl) r = self.fetch(self.archiveorgurl)
if r: # no live and no archive.org entry :((
if r.url != self.url: # however, by miracle, I may already have a copy, so skip if it's there already
self.fm.metadata['realurl'] = r.url if not r:
logging.error("no live or archive version of %s found :((", self.url)
if not self.exists:
self.save()
return
self.fm.content = r.text self.fm.content = r.text
self.write() self._get_images()
return self.save()
class Renderer(object): class Renderer(object):
@ -302,9 +450,10 @@ class Renderer(object):
@staticmethod @staticmethod
def jinja_filter_date(d, form='%Y-%m-%d %H:%m:%S'): def jinja_filter_date(d, form='%Y-%m-%d %H:%m:%S'):
if d == 'now': if d == 'now':
return arrow.now().strftime(form) d = arrow.now().datetime
if form == 'c': if form == 'c':
form = '%Y-%m-%dT%H:%M:%S%z' return d.isoformat()
#form = '%Y-%m-%dT%H:%M:%S%z'
return d.strftime(form) return d.strftime(form)
@ -422,7 +571,7 @@ class Comment(BaseRenderable):
'content': self.content, 'content': self.content,
'html': self.html, 'html': self.html,
'source': self.source, 'source': self.source,
'target': self.target, 'target': self.targeturl,
'type': self.meta.get('type', 'webmention'), 'type': self.meta.get('type', 'webmention'),
'reacji': self.reacji, 'reacji': self.reacji,
'fname': self.fname 'fname': self.fname
@ -456,34 +605,43 @@ class Comment(BaseRenderable):
return self._source return self._source
@property
def targeturl(self):
    """Path part of the URL this comment targets, without surrounding slashes.

    The URL is read from the ``target`` key of ``self.meta`` and falls back
    to the configured site URL. The result is computed once and cached on
    the instance as ``_targeturl``.
    """
    if not hasattr(self, '_targeturl'):
        fallback = shared.config.get('site', 'url')
        linked = self.meta.get('target', fallback)
        path = str(urllib.parse.urlparse(linked).path)
        self._targeturl = path.strip('/')
    return self._targeturl
@property @property
def target(self): def target(self):
if hasattr(self, '_target'): if hasattr(self, '_target'):
return self._target return self._target
t = self.meta.get('target', shared.config.get('site', 'url'))
self._target = '{p.path}'.format(p=urllib.parse.urlparse(t)).strip('/')
return self._target
async def render(self, renderer):
logging.info("rendering and saving comment %s", self.fname)
targetdir = os.path.abspath(os.path.join( targetdir = os.path.abspath(os.path.join(
shared.config.get('target', 'builddir'), shared.config.get('target', 'builddir'),
shared.config.get('site', 'commentspath'), shared.config.get('site', 'commentspath'),
self.fname self.fname
)) ))
target = os.path.join(targetdir, 'index.html')
if not shared.config.getboolean('params', 'force') and os.path.isfile(target): self._target = os.path.join(targetdir, 'index.html')
ttime = int(os.path.getmtime(target)) return self._target
async def render(self, renderer):
logging.info("rendering and saving comment %s", self.fname)
if not shared.config.getboolean('params', 'force') and os.path.isfile(self.target):
ttime = int(os.path.getmtime(self.target))
logging.debug('ttime is %d mtime is %d', ttime, self.mtime) logging.debug('ttime is %d mtime is %d', ttime, self.mtime)
if ttime == self.mtime: if ttime == self.mtime:
logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime) logging.debug(
'%s exists and up-to-date (lastmod: %d)',
self.target,
ttime
)
return return
#if not os.path.isdir(targetdir):
#os.mkdir(targetdir)
tmplvars = { tmplvars = {
'reply': self.tmplvars, 'reply': self.tmplvars,
'site': renderer.sitevars, 'site': renderer.sitevars,
@ -719,7 +877,8 @@ class WebImage(object):
self._rssenclosure = { self._rssenclosure = {
'mime': magic.Magic(mime=True).from_file(target['fpath']), 'mime': magic.Magic(mime=True).from_file(target['fpath']),
'url': target['url'], 'url': target['url'],
'size': os.path.getsize(target['fpath']) 'size': os.path.getsize(target['fpath']),
'fname': self.fname
} }
return self._rssenclosure return self._rssenclosure
@ -976,8 +1135,8 @@ class Taxonomy(BaseIter):
async def render(self, renderer): async def render(self, renderer):
if not self.slug or self.slug is 'None': #if not self.slug or self.slug is 'None':
return #return
self.__mkdirs() self.__mkdirs()
page = 1 page = 1
@ -1031,24 +1190,20 @@ class Taxonomy(BaseIter):
os.utime(target, (self.mtime, self.mtime)) os.utime(target, (self.mtime, self.mtime))
if 1 == page: if 1 == page:
target = os.path.join(self.feedp, 'index.rss') #target = os.path.join(self.feedp, 'index.rss')
logging.info("rendering RSS feed to %s", target) #logging.info("rendering RSS feed to %s", target)
r = renderer.j2.get_template('rss.html').render(tmplvars) #r = renderer.j2.get_template('rss.html').render(tmplvars)
#with open(target, "wt") as html:
#html.write(r)
#os.utime(target, (self.mtime, self.mtime))
target = os.path.join(self.feedp, 'index.atom')
logging.info("rendering Atom feed to %s", target)
r = renderer.j2.get_template('atom.html').render(tmplvars)
with open(target, "wt") as html: with open(target, "wt") as html:
html.write(r) html.write(r)
os.utime(target, (self.mtime, self.mtime)) os.utime(target, (self.mtime, self.mtime))
if not self.taxonomy or self.taxonomy == 'category':
t = shared.config.get('site', 'websuburl')
data = {
'hub.mode': 'publish',
'hub.url': "%s%s" % (
shared.config.get('site', 'url'), self.baseurl
)
}
logging.info("pinging %s with data %s", t, data)
requests.post(t, data=data)
# --- # ---
# this is a joke # this is a joke
# see http://indieweb.org/YAMLFeed # see http://indieweb.org/YAMLFeed
@ -1081,6 +1236,18 @@ class Taxonomy(BaseIter):
os.utime(target, (self.mtime, self.mtime)) os.utime(target, (self.mtime, self.mtime))
# --- # ---
if 1 == page:
if not self.taxonomy or self.taxonomy == 'category':
t = shared.config.get('site', 'websuburl')
data = {
'hub.mode': 'publish',
'hub.url': "%s%s" % (
shared.config.get('site', 'url'), self.baseurl
)
}
logging.info("pinging %s with data %s", t, data)
requests.post(t, data=data)
class Content(BaseIter): class Content(BaseIter):
def __init__(self, images, comments, extensions=['md']): def __init__(self, images, comments, extensions=['md']):
@ -1557,7 +1724,7 @@ class Singular(BaseRenderable):
if not isinstance(maybe, list): if not isinstance(maybe, list):
maybe = [maybe] maybe = [maybe]
for url in maybe: for url in maybe:
copies[url] = OfflineCopy(url) copies[url] = OfflineArchive(url)
copies[url].run() copies[url].run()
self.copies = copies self.copies = copies
@ -1601,7 +1768,8 @@ class Singular(BaseRenderable):
'slug': self.fname, 'slug': self.fname,
'shortslug': self.shortslug, 'shortslug': self.shortslug,
'rssenclosure': self.rssenclosure, 'rssenclosure': self.rssenclosure,
'copies': self.offlinecopies, #'copies': self.offlinecopies,
'copies': [],
'comments': self.comments, 'comments': self.comments,
'replies': self.replies, 'replies': self.replies,
'reacjis': self.reacjis, 'reacjis': self.reacjis,
@ -1617,6 +1785,15 @@ class Singular(BaseRenderable):
return self._shortslug return self._shortslug
@property
def target(self):
    """Absolute path of the rendered ``index.html`` for this entry.

    The file lives in a directory named after ``self.fname`` inside the
    configured build directory.
    """
    builddir = shared.config.get('target', 'builddir')
    entrydir = os.path.abspath(os.path.join(builddir, self.fname))
    return os.path.join(entrydir, 'index.html')
async def rendercomments(self, renderer): async def rendercomments(self, renderer):
for comment in self.comments: for comment in self.comments:
await comment.render(renderer) await comment.render(renderer)
@ -1638,17 +1815,15 @@ class Singular(BaseRenderable):
mtime = lctime mtime = lctime
logging.info("rendering and saving %s", self.fname) logging.info("rendering and saving %s", self.fname)
targetdir = os.path.abspath(os.path.join( if not shared.config.getboolean('params', 'force') and os.path.isfile(self.target):
shared.config.get('target', 'builddir'), ttime = int(os.path.getmtime(self.target))
self.fname
))
target = os.path.join(targetdir, 'index.html')
if not shared.config.getboolean('params', 'force') and os.path.isfile(target):
ttime = int(os.path.getmtime(target))
logging.debug('ttime is %d mtime is %d', ttime, mtime) logging.debug('ttime is %d mtime is %d', ttime, mtime)
if ttime == mtime: if ttime == mtime:
logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime) logging.debug(
'%s exists and up-to-date (lastmod: %d)',
self.target,
ttime
)
return return
tmplvars = { tmplvars = {
@ -1657,7 +1832,7 @@ class Singular(BaseRenderable):
'taxonomy': {}, 'taxonomy': {},
} }
r = renderer.j2.get_template(self.tmplfile).render(tmplvars) r = renderer.j2.get_template(self.tmplfile).render(tmplvars)
self.writerendered(target, r, mtime) self.writerendered(r, mtime)
async def ping(self, pinger): async def ping(self, pinger):
@ -1746,6 +1921,12 @@ class NASG(object):
default=False, default=False,
help='skip rendering' help='skip rendering'
) )
parser.add_argument(
'--refetch',
action='store_true',
default=False,
help='force re-fetching offline archives'
)
params = vars(parser.parse_args()) params = vars(parser.parse_args())
shared.config.add_section('params') shared.config.add_section('params')

2
new.py Normal file → Executable file
View file

@ -119,7 +119,7 @@ if __name__ == '__main__':
doc.content = content doc.content = content
tmpsave = os.path.join(tempfile.gettempdir(), "%s.md" % slug) tmpsave = os.path.join(tempfile.gettempdir(), "%s.md" % slug)
saveto = input('Save to: [%s]: ' % categories) or tmpsave saveto = input('Save to: [%s]: ' % categories) or 'bookmark'
if tmpsave != saveto: if tmpsave != saveto:
saveto = os.path.join(shared.config.get('source', 'contentdir'), saveto, "%s.md" % slug) saveto = os.path.join(shared.config.get('source', 'contentdir'), saveto, "%s.md" % slug)

View file

@ -14,6 +14,22 @@ from slugify import slugify
from pprint import pprint from pprint import pprint
""" TODO
- following from:
- tumblr
- deviantart
- flickr
- wordpress.com
- twitter
- 500px
"""
class Bookmark(object): class Bookmark(object):
def __init__(self, title, url, fname=None): def __init__(self, title, url, fname=None):
self.fm = frontmatter.loads('') self.fm = frontmatter.loads('')
@ -126,6 +142,37 @@ class Fav(object):
os.utime(self.target, (self.arrow.timestamp, self.arrow.timestamp)) os.utime(self.target, (self.arrow.timestamp, self.arrow.timestamp))
class PinterestFav(Fav):
    """A favourite built from the JSON-LD metadata of a Pinterest pin page."""

    def __init__(self, url):
        """Remember the pin URL and derive the markdown file name from it.

        Args:
            url: URL of the pin; its last non-empty path segment becomes
                part of the file name.
        """
        super(PinterestFav, self).__init__()
        self.url = url
        self.fname = "pinterest-%s.md" % (list(filter(None, url.split('/')))[-1])

    def run(self):
        """Fetch the pin and fill in the frontmatter metadata and content.

        Reads the ``application/ld+json`` script block of the page, saves
        the referenced image and converts the article body with Pandoc.
        Every failure is logged and swallowed; nothing is raised.
        """
        try:
            # a timeout keeps an unresponsive server from blocking the run
            r = requests.get(self.url, timeout=60)
            # an HTTP error page carries no pin metadata; fail early
            r.raise_for_status()
            soup = bs4.BeautifulSoup(r.text, 'lxml')
            ld = json.loads(soup.find('script', type='application/ld+json').text)
            imgurl = ld.get('image')
            # saveimg / imgname are presumably provided by Fav - not visible here
            self.saveimg(imgurl)

            self.fm.metadata = {
                'published': arrow.get(
                    ld.get('datePublished', arrow.utcnow().timestamp)
                ).format(shared.ARROWISO),
                'title': ld.get('headline', self.url),
                'favorite-of': self.url,
                'image': self.imgname
            }
            content = ld.get('articleBody', '')
            content = shared.Pandoc(False).convert(content)
            self.fm.content = content
        except Exception as e:
            logging.error('saving pinterest fav %s failed: %s', self.url, e)
            return
class FlickrFav(Fav): class FlickrFav(Fav):
def __init__(self, photo): def __init__(self, photo):
super(FlickrFav, self).__init__() super(FlickrFav, self).__init__()
@ -280,6 +327,31 @@ class FivehpxFavs(Favs):
fav.write() fav.write()
#class Following(object):
#def __init__(self, confgroup):
#self.confgroup = confgroup
#self.url = shared.config.get(confgroup, 'fav_api')
#class FlickrFollowing(Following):
#def __init__(self):
#super(FlickrFollowing, self).__init__('flickr')
#self.params = {
#'method': 'flickr.contacts.getList',
#'api_key': shared.config.get('flickr', 'api_key'),
#'format': 'json',
#'nojsoncallback': '1',
#}
#def run(self):
#r = requests.get(self.url,params=self.params)
#js = json.loads(r.text)
#pprint(js)
#for contact in js.get('contacts', {}).get('contact', []):
#pprint(contact)
if __name__ == '__main__': if __name__ == '__main__':
while len(logging.root.handlers) > 0: while len(logging.root.handlers) > 0:
logging.root.removeHandler(logging.root.handlers[-1]) logging.root.removeHandler(logging.root.handlers[-1])
@ -297,3 +369,6 @@ if __name__ == '__main__':
fivehpx = FivehpxFavs() fivehpx = FivehpxFavs()
fivehpx.run() fivehpx.run()
#flickrfollow = FlickrFollowing()
#flickrfollow.run()

View file

@ -4,9 +4,11 @@ import re
import glob import glob
import logging import logging
import subprocess import subprocess
import json
from whoosh import fields from whoosh import fields
from whoosh import analysis from whoosh import analysis
from slugify import slugify
def __expandconfig(config): def __expandconfig(config):
""" add the dirs to the config automatically """ """ add the dirs to the config automatically """
@ -38,6 +40,8 @@ def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
).lstrip(numerals[0]) + numerals[num % b] ).lstrip(numerals[0]) + numerals[num % b]
) )
def slugfname(url):
    """Return a file-name-safe slug for *url*, at most 200 characters long.

    The ``http://`` / ``https://`` scheme and a leading ``www.`` label are
    dropped before slugifying.
    """
    # "www\." (with the dot) so that only the "www." label is removed and
    # hosts that merely start with the letters "www" are left intact
    stripped = re.sub(r"^https?://(?:www\.)?", "", url)
    return "%s" % slugify(stripped)[:200]
ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ' ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
STRFISO = '%Y-%m-%dT%H:%M:%S%z' STRFISO = '%Y-%m-%dT%H:%M:%S%z'
@ -104,6 +108,65 @@ config.read('config.ini')
config = __expandconfig(config) config = __expandconfig(config)
class TokenDB(object):
    """OAuth token store kept in memory and mirrored to ``tokens.json``.

    ``self.tokens`` is one flat dict holding two kinds of entries:
    ``service name -> token`` and
    ``token -> {'oauth_token', 'oauth_token_secret'[, 'verifier']}``.

    The setters only change the in-memory dict; call ``save()`` to persist.
    """

    def __init__(self):
        """Locate ``tokens.json`` in the configured base dir and load it."""
        self.db = os.path.abspath(os.path.join(
            config.get('common', 'basedir'),
            'tokens.json'
        ))
        self.tokens = {}
        self.refresh()

    def refresh(self):
        """Reload the tokens from disk; keep the current ones if no file exists."""
        if os.path.isfile(self.db):
            with open(self.db, 'rt') as f:
                self.tokens = json.load(f)

    def save(self):
        """Write the tokens to disk as sorted, indented JSON, then reload them."""
        with open(self.db, 'wt') as f:
            json.dump(self.tokens, f, indent=4, sort_keys=True)
        self.refresh()

    def get_token(self, token):
        """Return the entry stored under *token*, or None when unknown."""
        return self.tokens.get(token, None)

    def get_service(self, service):
        """Return the token entry mapped to *service*, or None when unknown."""
        s = self.tokens.get(service, None)
        if s:
            # the service entry holds the token string; resolve it to its data
            s = self.get_token(s)
        return s

    def set_service(self, service, token):
        """Map *service* to *token* (in memory only)."""
        self.tokens[service] = token

    def set_token(self, token, secret):
        """Store the token/secret pair under *token* (in memory only)."""
        self.tokens[token] = {
            'oauth_token': token,
            'oauth_token_secret': secret
        }

    def set_verifier(self, token, verifier):
        """Attach *verifier* to the entry of *token* (in memory only).

        An entry is created when *token* is not known yet, instead of
        failing on the missing record.
        """
        self.tokens.setdefault(token, {})['verifier'] = verifier
tokendb = TokenDB()
class CMDLine(object): class CMDLine(object):
def __init__(self, executable): def __init__(self, executable):
self.executable = self._which(executable) self.executable = self._which(executable)