From 70bd917de4c0595b510dd3a54b80424bf53e1564 Mon Sep 17 00:00:00 2001
From: Peter Molnar
Date: Wed, 28 Jun 2017 11:20:26 +0000
Subject: [PATCH] updated

---
 nasg.py   | 311 ++++++++++++++++++++++++++++++++++++++++++------------
 new.py    |   2 +-
 pesos.py  |  75 +++++++++++++
 shared.py |  65 +++++++++++-
 4 files changed, 386 insertions(+), 67 deletions(-)
 mode change 100644 => 100755 new.py

diff --git a/nasg.py b/nasg.py
index 4683fa8..6f2b804 100755
--- a/nasg.py
+++ b/nasg.py
@@ -16,6 +16,9 @@ import math
 import asyncio
 import csv
 import getpass
+import quopri
+import base64
+import mimetypes
 
 import magic
 import arrow
@@ -33,6 +36,7 @@ from webmentiontools.send import WebmentionSend
 from bleach import clean
 from emoji import UNICODE_EMOJI
 from bs4 import BeautifulSoup
+from readability.readability import Document
 import shared
 
 def splitpath(path):
@@ -89,7 +93,8 @@ class BaseRenderable(object):
         return
 
 
-    def writerendered(self, content):
+    def writerendered(self, content, mtime=None):
+        mtime = mtime or self.mtime
         d = os.path.dirname(self.target)
         if not os.path.isdir(d):
             os.mkdir(d)
@@ -98,7 +103,7 @@ class BaseRenderable(object):
         logging.debug('writing %s', self.target)
         html.write(content)
         html.close()
-        os.utime(self.target, (self.mtime, self.mtime))
+        os.utime(self.target, (mtime, mtime))
 
 
 class Indexer(object):
@@ -197,14 +202,25 @@ class Indexer(object):
         self.writer.commit()
 
 
-class OfflineCopy(object):
-    def __init__(self, url):
+class OfflineArchive(object):
+    # keep in mind that these are frontmattered HTML files with full HTML and embedded images
+    # they can get VERY large
+    def __init__(self, url, content=None, decode_email=False):
         self.url = url
-        self.fname = "%s.md" % slugify(re.sub(r"^https?://", "", url))[:200]
+        self.parsed = urllib.parse.urlparse(url)
+        self.fbase = shared.slugfname(url)
+        self.fname = "%s.md" % self.fbase
         self.target = os.path.join(
             shared.config.get('source', 'offlinecopiesdir'),
             self.fname
         )
+        self.targetd = os.path.join(
+            shared.config.get('source', 'offlinecopiesdir'),
+            self.fbase
+        )
+        if not os.path.isdir(self.targetd):
+            os.mkdir(self.targetd)
+
         self.fm = frontmatter.loads('')
         self.fm.metadata = {
             'url': self.url,
@@ -215,36 +231,152 @@ class OfflineCopy(object):
             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
         })
 
-    def __repr__(self):
-        return self.fm.content
+        self.skip_fetch = False
+        if content:
+            self.skip_fetch = True
+            if decode_email:
+                content = quopri.decodestring(content)
+                content = str(content, 'utf-8', errors='replace')
+            self.fm.content = content
+        #self.tmp = tempfile.mkdtemp(
+            #'offlinearchive_',
+            #dir=tempfile.gettempdir()
+        #)
+        #atexit.register(
+            #shutil.rmtree,
+            #os.path.abspath(self.tmp)
+        #)
+        #self.images = []
 
-    def write(self):
+        self.exists = os.path.isfile(self.target)
+
+    def _getimage(self, src):
+        imgname, imgext = os.path.splitext(os.path.basename(src))
+        imgtarget = os.path.join(
+            self.targetd,
+            "%s%s" % (slugify(imgname, only_ascii=True, lower=True), imgext)
+        )
+        try:
+            logging.debug('downloading image %s', src)
+            r = requests.get(
+                src,
+                allow_redirects=True,
+                timeout=60,
+                stream=True
+            )
+            with open(imgtarget, 'wb') as f:
+                for chunk in r.iter_content():
+                    if chunk:
+                        f.write(chunk)
+
+            self.fm.content = self.fm.content.replace(
+                src,
+                '%s/%s' % (self.fbase, imgname)
+            )
+        except Exception as e:
+            logging.error('pulling image %s failed: %s', src, e)
+            return
+
+    def _get_images(self):
+        logging.debug("trying to save images")
+        soup = BeautifulSoup(self.fm.content, 'lxml')
+
+        embedded = re.compile(r'^data:.*')
+        for img in soup.find_all('img'):
+            src = img.get('src')
+            if not src:
+                continue
+            if embedded.match(src):
+                continue
+
+            im = urllib.parse.urlparse(src)
+            if not im.scheme:
+                im = im._replace(scheme=self.parsed.scheme)
+            if not im.netloc:
+                im = im._replace(netloc=self.parsed.netloc)
+
+            self._getimage(im.geturl())
+
+
+    #def _getimage(self, src):
+        #tmp = os.path.join(self.tmp, "%s" % slugify(os.path.basename(src))[:200])
+        #try:
+            #r = requests.get(
+                #src,
+                #allow_redirects=True,
+                #timeout=60,
+                #stream=True
+            #)
+            #with open(tmp, 'wb') as f:
+                #for chunk in r.iter_content():
+                    #if chunk:
+                        #f.write(chunk)
+
+            #logging.debug('trying to embed %s', src)
+            #with open(tmp, 'rb') as imgdata:
+                #data = str(base64.b64encode(imgdata.read()), 'ascii')
+                #mimetype, encoding = mimetypes.guess_type(tmp)
+                #self.fm.content = self.fm.content.replace(
+                    #src,
+                    #"data:%s;base64,%s" % (mimetype, data)
+                #)
+        #except Exception as e:
+            #logging.error('pulling image %s failed: %s', src, e)
+            #return
+
+    #def _embed_images(self):
+        #logging.debug("trying to embed images")
+        #soup = BeautifulSoup(self.fm.content, 'lxml')
+
+        #embedded = re.compile(r'^data:.*')
+        #for img in soup.find_all('img'):
+            #src = img.get('src')
+            #if not src:
+                #continue
+            #if embedded.match(src):
+                #continue
+
+            #im = urllib.parse.urlparse(src)
+            #if not im.scheme:
+                #im = im._replace(scheme=self.parsed.scheme)
+            #if not im.netloc:
+                #im = im._replace(netloc=self.parsed.netloc)
+
+            #self._getimage(im.geturl())
+
+    def save(self):
         logging.info(
             "savig offline copy of\n\t%s to:\n\t%s",
             self.url,
             self.target
         )
+
         with open(self.target, 'wt') as f:
             f.write(frontmatter.dumps(self.fm))
 
-    @property
     def archiveorgurl(self):
+        logging.debug("trying archive.org for %s", self.url)
         a = self.fetch(
             "http://archive.org/wayback/available?url=%s" % self.url,
         )
         if not a:
+            logging.debug("no entry for %s on archive.org", self.url)
             return None
         try:
             a = json.loads(a.text)
-            return a.get(
+            aurl = a.get(
                 'archived_snapshots', {}
             ).get(
                 'closest', {}
             ).get(
                 'url', None
             )
+            logging.debug("found %s in archive.org for %s", aurl, self.url)
+            self.updateurl(aurl)
+            return self.fetch(aurl)
         except Exception as e:
             logging.error("archive.org parsing failed: %s", e)
             return None
@@ -264,24 +396,40 @@ class OfflineCopy(object):
             return None
 
 
-    def run(self):
+    def read(self):
         if os.path.isfile(self.target):
             with open(self.target) as f:
                 self.fm = frontmatter.loads(f.read())
                 return
 
-        logging.info("prepairing offline copy of %s", self.url)
-        r = self.fetch(self.url)
-        if not r:
-            r = self.fetch(self.archiveorgurl)
-        if r:
-            if r.url != self.url:
-                self.fm.metadata['realurl'] = r.url
+    def run(self):
+        if self.exists:
+            logging.info("offline archive for %s already exists", self.url)
+            return
+
+        logging.info("preparing offline copy of %s", self.url)
+
+        if not self.skip_fetch:
+            r = self.fetch(self.url)
+
+        # in case it's not, try to look for an archive.org url:
+        if not r:
+            logging.warning("couldn't get live version of %s, trying archive.org", self.url)
+            r = self.fetch(self.archiveorgurl)
+
+        # no live and no archive.org entry :((
+        # however, by miracle, I may already have a copy, so skip if it's there already
+        if not r:
+            logging.error("no live or archive version of %s found :((", self.url)
+            if not self.exists:
+                self.save()
+            return
+
         self.fm.content = r.text
-        self.write()
-        return
+        self._get_images()
+        self.save()
 
 
 class Renderer(object):
@@ -302,9 +450,10 @@ class Renderer(object):
 
     @staticmethod
     def jinja_filter_date(d, form='%Y-%m-%d %H:%m:%S'):
         if d == 'now':
-            return arrow.now().strftime(form)
+            d = arrow.now().datetime
         if form == 'c':
-            form = '%Y-%m-%dT%H:%M:%S%z'
+            return d.isoformat()
+            #form = '%Y-%m-%dT%H:%M:%S%z'
         return d.strftime(form)
 
@@ -422,7 +571,7 @@ class Comment(BaseRenderable):
             'content': self.content,
             'html': self.html,
             'source': self.source,
-            'target': self.target,
+            'target': self.targeturl,
             'type': self.meta.get('type', 'webmention'),
             'reacji': self.reacji,
             'fname': self.fname
@@ -456,34 +605,43 @@ class Comment(BaseRenderable):
         return self._source
 
+    @property
+    def targeturl(self):
+        if hasattr(self, '_targeturl'):
+            return self._targeturl
+        t = self.meta.get('target', shared.config.get('site', 'url'))
+        self._targeturl = '{p.path}'.format(p=urllib.parse.urlparse(t)).strip('/')
+        return self._targeturl
+
     @property
     def target(self):
         if hasattr(self, '_target'):
             return self._target
-        t = self.meta.get('target', shared.config.get('site', 'url'))
-        self._target = '{p.path}'.format(p=urllib.parse.urlparse(t)).strip('/')
-        return self._target
-
-    async def render(self, renderer):
-        logging.info("rendering and saving comment %s", self.fname)
         targetdir = os.path.abspath(os.path.join(
             shared.config.get('target', 'builddir'),
             shared.config.get('site', 'commentspath'),
             self.fname
         ))
-        target = os.path.join(targetdir, 'index.html')
-        if not shared.config.getboolean('params', 'force') and os.path.isfile(target):
-            ttime = int(os.path.getmtime(target))
+        self._target = os.path.join(targetdir, 'index.html')
+        return self._target
+
+
+    async def render(self, renderer):
+        logging.info("rendering and saving comment %s", self.fname)
+
+        if not shared.config.getboolean('params', 'force') and os.path.isfile(self.target):
+            ttime = int(os.path.getmtime(self.target))
             logging.debug('ttime is %d mtime is %d', ttime, self.mtime)
             if ttime == self.mtime:
-                logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime)
+                logging.debug(
+                    '%s exists and up-to-date (lastmod: %d)',
+                    self.target,
+                    ttime
+                )
                 return
 
-        #if not os.path.isdir(targetdir):
-            #os.mkdir(targetdir)
-
         tmplvars = {
             'reply': self.tmplvars,
             'site': renderer.sitevars,
@@ -719,7 +877,8 @@ class WebImage(object):
             self._rssenclosure = {
                 'mime': magic.Magic(mime=True).from_file(target['fpath']),
                 'url': target['url'],
-                'size': os.path.getsize(target['fpath'])
+                'size': os.path.getsize(target['fpath']),
+                'fname': self.fname
             }
         return self._rssenclosure
 
@@ -976,8 +1135,8 @@ class Taxonomy(BaseIter):
 
 
     async def render(self, renderer):
-        if not self.slug or self.slug is 'None':
-            return
+        #if not self.slug or self.slug is 'None':
+            #return
 
         self.__mkdirs()
         page = 1
@@ -1031,24 +1190,20 @@ class Taxonomy(BaseIter):
             os.utime(target, (self.mtime, self.mtime))
 
             if 1 == page:
-                target = os.path.join(self.feedp, 'index.rss')
-                logging.info("rendering RSS feed to %s", target)
-                r = renderer.j2.get_template('rss.html').render(tmplvars)
+                #target = os.path.join(self.feedp, 'index.rss')
+                #logging.info("rendering RSS feed to %s", target)
+                #r = renderer.j2.get_template('rss.html').render(tmplvars)
+                #with open(target, "wt") as html:
+                    #html.write(r)
+                #os.utime(target, (self.mtime, self.mtime))
+
+                target = os.path.join(self.feedp, 'index.atom')
+                logging.info("rendering Atom feed to %s", target)
+                r = renderer.j2.get_template('atom.html').render(tmplvars)
                 with open(target, "wt") as html:
                     html.write(r)
                 os.utime(target, (self.mtime, self.mtime))
 
-                if not self.taxonomy or self.taxonomy == 'category':
-                    t = shared.config.get('site', 'websuburl')
-                    data = {
-                        'hub.mode': 'publish',
-                        'hub.url': "%s%s" % (
-                            shared.config.get('site', 'url'), self.baseurl
-                        )
-                    }
-                    logging.info("pinging %s with data %s", t, data)
-                    requests.post(t, data=data)
-
         # ---
         # this is a joke
         # see http://indieweb.org/YAMLFeed
@@ -1081,6 +1236,18 @@ class Taxonomy(BaseIter):
             os.utime(target, (self.mtime, self.mtime))
         # ---
 
+        if 1 == page:
+            if not self.taxonomy or self.taxonomy == 'category':
+                t = shared.config.get('site', 'websuburl')
+                data = {
+                    'hub.mode': 'publish',
+                    'hub.url': "%s%s" % (
+                        shared.config.get('site', 'url'), self.baseurl
+                    )
+                }
+                logging.info("pinging %s with data %s", t, data)
+                requests.post(t, data=data)
+
 
 class Content(BaseIter):
     def __init__(self, images, comments, extensions=['md']):
@@ -1557,7 +1724,7 @@ class Singular(BaseRenderable):
             if not isinstance(maybe, list):
                 maybe = [maybe]
             for url in maybe:
-                copies[url] = OfflineCopy(url)
+                copies[url] = OfflineArchive(url)
                 copies[url].run()
 
         self.copies = copies
@@ -1601,7 +1768,8 @@ class Singular(BaseRenderable):
             'slug': self.fname,
             'shortslug': self.shortslug,
             'rssenclosure': self.rssenclosure,
-            'copies': self.offlinecopies,
+            #'copies': self.offlinecopies,
+            'copies': [],
             'comments': self.comments,
             'replies': self.replies,
             'reacjis': self.reacjis,
@@ -1617,6 +1785,15 @@ class Singular(BaseRenderable):
         return self._shortslug
 
+    @property
+    def target(self):
+        targetdir = os.path.abspath(os.path.join(
+            shared.config.get('target', 'builddir'),
+            self.fname
+        ))
+        return os.path.join(targetdir, 'index.html')
+
+
     async def rendercomments(self, renderer):
         for comment in self.comments:
             await comment.render(renderer)
@@ -1638,17 +1815,15 @@ class Singular(BaseRenderable):
                 mtime = lctime
 
         logging.info("rendering and saving %s", self.fname)
-        targetdir = os.path.abspath(os.path.join(
-            shared.config.get('target', 'builddir'),
-            self.fname
-        ))
-        target = os.path.join(targetdir, 'index.html')
-
-        if not shared.config.getboolean('params', 'force') and os.path.isfile(target):
-            ttime = int(os.path.getmtime(target))
+        if not shared.config.getboolean('params', 'force') and os.path.isfile(self.target):
+            ttime = int(os.path.getmtime(self.target))
             logging.debug('ttime is %d mtime is %d', ttime, mtime)
             if ttime == mtime:
-                logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime)
+                logging.debug(
+                    '%s exists and up-to-date (lastmod: %d)',
+                    self.target,
+                    ttime
+                )
                 return
 
         tmplvars = {
@@ -1657,7 +1832,7 @@ class Singular(BaseRenderable):
             'taxonomy': {},
         }
         r = renderer.j2.get_template(self.tmplfile).render(tmplvars)
-        self.writerendered(target, r, mtime)
+        self.writerendered(r, mtime)
 
 
     async def ping(self, pinger):
@@ -1746,6 +1921,12 @@ class NASG(object):
             default=False,
             help='skip rendering'
         )
+        parser.add_argument(
+            '--refetch',
+            action='store_true',
+            default=False,
+            help='force re-fetching offline archives'
+        )
         params = vars(parser.parse_args())
 
         shared.config.add_section('params')
diff --git a/new.py b/new.py
old mode 100644
new mode 100755
index 93b040a..2b994bf
--- a/new.py
+++ b/new.py
@@ -119,7 +119,7 @@ if __name__ == '__main__':
     doc.content = content
 
     tmpsave = os.path.join(tempfile.gettempdir(), "%s.md" % slug)
-    saveto = input('Save to: [%s]: ' % categories) or tmpsave
+    saveto = input('Save to: [%s]: ' % categories) or 'bookmark'
     if tmpsave != saveto:
         saveto = os.path.join(shared.config.get('source', 'contentdir'), saveto, "%s.md" % slug)
diff --git a/pesos.py b/pesos.py
index 2a5fe0e..bff3338 100644
--- a/pesos.py
+++ b/pesos.py
@@ -14,6 +14,22 @@ from slugify import slugify
 from pprint import pprint
 
+""" TODO
+
+- following from:
+    - tumblr
+    - deviantart
+    - flickr
+    - wordpress.com
+    - twitter
+    - 500px
+
+
+"""
+
+
+
+
 
 class Bookmark(object):
     def __init__(self, title, url, fname=None):
         self.fm = frontmatter.loads('')
@@ -126,6 +142,37 @@ class Fav(object):
         os.utime(self.target, (self.arrow.timestamp, self.arrow.timestamp))
 
 
+class PinterestFav(Fav):
+    def __init__(self, url):
+        super(PinterestFav, self).__init__()
+        self.url = url
+        self.fname = "pinterest-%s.md" % (list(filter(None, url.split('/')))[-1])
+
+    def run(self):
+        try:
+            r = requests.get(self.url)
+            soup = bs4.BeautifulSoup(r.text, 'lxml')
+            ld = json.loads(soup.find('script', type='application/ld+json').text)
+            imgurl = ld.get('image')
+            self.saveimg(imgurl)
+
+            self.fm.metadata = {
+                'published': arrow.get(
+                    ld.get('datePublished', arrow.utcnow().timestamp)
+                ).format(shared.ARROWISO),
+                'title': ld.get('headline', self.url),
+                'favorite-of': self.url,
+                'image': self.imgname
+            }
+            content = ld.get('articleBody', '')
+            content = shared.Pandoc(False).convert(content)
+            self.fm.content = content
+
+        except Exception as e:
+            logging.error('saving pinterest fav %s failed: %s', self.url, e)
+            return
+
+
 class FlickrFav(Fav):
     def __init__(self, photo):
         super(FlickrFav, self).__init__()
@@ -280,6 +327,31 @@ class FivehpxFavs(Favs):
             fav.write()
 
 
+#class Following(object):
+    #def __init__(self, confgroup):
+        #self.confgroup = confgroup
+        #self.url = shared.config.get(confgroup, 'fav_api')
+
+
+#class FlickrFollowing(Following):
+    #def __init__(self):
+        #super(FlickrFollowing, self).__init__('flickr')
+        #self.params = {
+            #'method': 'flickr.contacts.getList',
+            #'api_key': shared.config.get('flickr', 'api_key'),
+            #'format': 'json',
+            #'nojsoncallback': '1',
+        #}
+
+    #def run(self):
+        #r = requests.get(self.url,params=self.params)
+        #js = json.loads(r.text)
+        #pprint(js)
+        #for contact in js.get('contacts', {}).get('contact', []):
+            #pprint(contact)
+
+
+
 if __name__ == '__main__':
     while len(logging.root.handlers) > 0:
         logging.root.removeHandler(logging.root.handlers[-1])
@@ -297,3 +369,6 @@ if __name__ == '__main__':
 
     fivehpx = FivehpxFavs()
     fivehpx.run()
+
+    #flickrfollow = FlickrFollowing()
+    #flickrfollow.run()
diff --git a/shared.py b/shared.py
index ab8be86..6cea144 100644
--- a/shared.py
+++ b/shared.py
@@ -4,9 +4,11 @@ import re
 import glob
 import logging
 import subprocess
+import json
+
 from whoosh import fields
 from whoosh import analysis
-
+from slugify import slugify
 
 def __expandconfig(config):
     """ add the dirs to the config automatically """
@@ -38,6 +40,8 @@ def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
         ).lstrip(numerals[0]) + numerals[num % b]
     )
 
+def slugfname(url):
+    return "%s" % slugify(re.sub(r"^https?://(?:www)?", "", url))[:200]
 
 ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
 STRFISO = '%Y-%m-%dT%H:%M:%S%z'
@@ -104,6 +108,65 @@ config.read('config.ini')
 config = __expandconfig(config)
 
 
+class TokenDB(object):
+    def __init__(self):
+        self.db = os.path.abspath(os.path.join(
+            config.get('common', 'basedir'),
+            'tokens.json'
+        ))
+        self.tokens = {}
+        self.refresh()
+
+    def refresh(self):
+        if os.path.isfile(self.db):
+            with open(self.db, 'rt') as f:
+                self.tokens = json.loads(f.read())
+
+    def save(self):
+        with open(self.db, 'wt') as f:
+            f.write(
+                json.dumps(
+                    self.tokens, indent=4, sort_keys=True
+                )
+            )
+        self.refresh()
+
+    def get_token(self, token):
+        return self.tokens.get(token, None)
+
+    def get_service(self, service):
+        s = self.tokens.get(service, None)
+        if s:
+            s = self.get_token(s)
+        return s
+
+    def set_service(self, service, token):
+        self.tokens.update({
+            service: token
+        })
+        #self.save()
+
+    def set_token(self, token, secret):
+        self.tokens.update({
+            token: {
+                'oauth_token': token,
+                'oauth_token_secret': secret
+            }
+        })
+        #self.save()
+
+    def set_verifier(self, token, verifier):
+        t = self.tokens.get(token)
+        t.update({
+            'verifier': verifier
+        })
+        self.tokens.update({
+            token: t
+        })
+        #self.save()
+
+tokendb = TokenDB()
+
 class CMDLine(object):
     def __init__(self, executable):
        self.executable = self._which(executable)
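
For reference, a minimal usage sketch of the TokenDB helper added to shared.py above. It only uses the methods shown in the patch; the 'flickr' token, secret, and verifier values are placeholders, and it assumes config.ini defines common.basedir so tokens.json can be written. Note that the setters have self.save() commented out, so callers must persist explicitly.

    import shared

    db = shared.tokendb                              # module-level instance created at import time

    # store an OAuth request token and its secret
    db.set_token('request-token-123', 'request-secret-456')

    # remember which token belongs to which service
    db.set_service('flickr', 'request-token-123')

    # after the OAuth callback, record the verifier for that token
    db.set_verifier('request-token-123', 'verifier-789')

    # the setters do not write tokens.json themselves, so save explicitly
    db.save()

    # later lookups: service name -> token dict with oauth_token, oauth_token_secret, verifier
    token = db.get_service('flickr')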