diff --git a/nasg.py b/nasg.py index 0f3b4e3..fbd72b1 100755 --- a/nasg.py +++ b/nasg.py @@ -25,7 +25,8 @@ import frontmatter from slugify import slugify import langdetect import requests -from breadability.readable import Article +#from breadability.readable import Article +from newspaper import Article as newspaper3k from whoosh import index from whoosh import qparser import jinja2 @@ -34,6 +35,7 @@ import shared from webmentiontools.send import WebmentionSend from bleach import clean from emoji import UNICODE_EMOJI +from bs4 import BeautifulSoup def splitpath(path): parts = [] @@ -114,8 +116,8 @@ class Indexer(object): ] content_remote = [] - for url, offlinecopy in singular.offlinecopies.items(): - content_remote.append("%s" % offlinecopy) + #for url, offlinecopy in singular.offlinecopies.items(): + #content_remote.append("%s" % offlinecopy) weight = 1 if singular.isbookmark: @@ -154,15 +156,13 @@ class Indexer(object): def finish(self): self.writer.commit() + class OfflineCopy(object): def __init__(self, url): self.url = url - self.fname = hashlib.sha1(url.encode('utf-8')).hexdigest() - self.targetdir = os.path.abspath( - shared.config.get('source', 'offlinecopiesdir') - ) + self.fname = "%s.md" % slugify(re.sub(r"^https?://", "", url))[:200] self.target = os.path.join( - self.targetdir, + shared.config.get('source', 'offlinecopiesdir'), self.fname ) self.fm = frontmatter.loads('') @@ -170,6 +170,10 @@ class OfflineCopy(object): 'url': self.url, 'date': arrow.utcnow().format("YYYY-MM-DDTHH:mm:ssZ"), } + self.headers = requests.utils.default_headers() + self.headers.update({ + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', + }) def __repr__(self): return self.fm.content @@ -183,6 +187,42 @@ class OfflineCopy(object): with open(self.target, 'wt') as f: f.write(frontmatter.dumps(self.fm)) + @property + def archiveorgurl(self): + a = self.fetch( + "http://archive.org/wayback/available?url=%s" % self.url, + ) + if not a: + 
return None + + try: + a = json.loads(a.text) + return a.get( + 'archived_snapshots', {} + ).get( + 'closest', {} + ).get( + 'url', None + ) + except Exception as e: + logging.error("archive.org parsing failed: %s", e) + return None + + def fetch(self, url): + try: + r = requests.get( + url, + allow_redirects=True, + timeout=60, + headers=self.headers + ) + if r.status_code == requests.codes.ok: + return r + except Exception as e: + return None + + + def run(self): if os.path.isfile(self.target): with open(self.target) as f: @@ -190,39 +230,17 @@ class OfflineCopy(object): return logging.info("prepairing offline copy of %s", self.url) - headers = requests.utils.default_headers() - headers.update({ - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' - }) + r = self.fetch(self.url) + if not r: + r = self.fetch(self.archiveorgurl) - try: - r = requests.get( - self.url, - allow_redirects=True, - timeout=60, - headers=headers - ) - except Exception as e: - logging.error("%s failed:\n%s", self.url, e) - self.write() - return + if r: + if r.url != self.url: + self.fm.metadata['realurl'] = r.url + self.fm.content = r.text - if r.status_code != requests.codes.ok: - logging.warning("%s returned %s", self.url, r.status_code) - self.write() - return - - if not len(r.text): - logging.warning("%s was empty", self.url) - self.write() - return - - doc = Article(r.text, url=self.url) - self.fm.metadata['title'] = doc._original_document.title - self.fm.metadata['realurl'] = r.url - self.fm.content = shared.Pandoc(False).convert(doc.readable) self.write() + return class Renderer(object): @@ -551,7 +569,7 @@ class WebImage(object): self.alttext = '' self.sizes = [] self.fallbacksize = int(shared.config.get('common','fallbackimg', fallback='720')) - self.cl = None + self.cl = '' self.singleimage = False for size in shared.config.options('downsize'): @@ -587,35 +605,118 
@@ class WebImage(object): ) def __str__(self): - if self.is_downsizeable and not self.cl: - uphoto = '' - if self.singleimage: - uphoto = ' u-photo' - return '\n
%s
%s%s
\n' % ( - uphoto, + if self.is_downsizeable: + if self.singleimage and not self.cl: + self.cl = '.u-photo' + elif self.singleimage: + self.cl = '.u-photo %s' % self.cl + + return '[![%s](%s "%s%s"){.adaptimg}](%s){.adaptive %s}' % ( + self.alttext, + self.fallback, + self.fname, + self.ext, self.target, - self.fallback, - self.alttext, - self.fname, - self.ext + self.cl ) - elif self.cl: - self.cl = self.cl.replace('.', ' ') - return '%s' % ( - self.fallback, - self.cl, + else: + if not self.cl: + self.cl = '.aligncenter' + return '![%s](%s "%s%s"){%s}' % ( self.alttext, + self.fallback, self.fname, - self.ext + self.ext, + self.cl ) - else: - return '%s' % ( - self.fallback, - self.alttext, - self.fname, - self.ext - ) + @property + def exif(self): + if not self.is_photo: + return {} + + if hasattr(self, '_exif'): + return self._exif + + exif = {} + mapping = { + 'camera': [ + 'Model' + ], + 'aperture': [ + 'FNumber', + 'Aperture' + ], + 'shutter_speed': [ + 'ExposureTime' + ], + 'focallength35mm': [ + 'FocalLengthIn35mmFormat', + ], + 'focallength': [ + 'FocalLength', + ], + 'iso': [ + 'ISO' + ], + 'lens': [ + 'LensID', + ], + 'date': [ + 'CreateDate', + 'DateTimeOriginal', + ], + 'geo_latitude': [ + 'GPSLatitude' + ], + 'geo_longitude': [ + 'GPSLongitude' + ], + } + + for ekey, candidates in mapping.items(): + for candidate in candidates: + maybe = self.meta.get(candidate, None) + if maybe: + if 'geo_' in ekey: + exif[ekey] = round(float(maybe), 5) + else: + exif[ekey] = maybe + break + + self._exif = exif + return self._exif + + #def __str__(self): + #if self.is_downsizeable and not self.cl: + #uphoto = '' + #if self.singleimage: + #uphoto = ' u-photo' + #return '\n
%s
%s%s
\n' % ( + #uphoto, + #self.target, + #self.fallback, + #self.alttext, + #self.fname, + #self.ext + #) + #elif self.cl: + #self.cl = self.cl.replace('.', ' ') + #return '%s' % ( + #self.fallback, + #self.cl, + #self.alttext, + #self.fname, + #self.ext + #) + + #else: + #return '%s' % ( + #self.fallback, + #self.alttext, + #self.fname, + #self.ext + #) @property def rssenclosure(self): @@ -869,6 +970,9 @@ class Taxonomy(BaseIter): return "%s/%d/index.html" % (self.pagep, page) async def render(self, renderer): + if not self.slug or self.slug == 'None': + return + self.__mkdirs() page = 1 testpath = self.tpath(page) @@ -907,7 +1011,8 @@ class Taxonomy(BaseIter): 'taxonomy': self.taxonomy, 'paged': page, 'total': self.pages, - 'perpage': pagination + 'perpage': pagination, + 'lastmod': arrow.get(self.mtime).datetime }, 'site': renderer.sitevars, 'posts': posttmpls, @@ -1100,12 +1205,41 @@ class Singular(BaseRenderable): def __parse(self): with open(self.path, mode='rt') as f: self.meta, self.content = frontmatter.parse(f.read()) - self.__filter_images() + self.__filter_favs() + self.__filter_images() if self.isphoto: self.content = "%s\n%s" % ( self.content, self.photo ) + # REMOVE THIS + trigger = self.offlinecopies + + def __filter_favs(self): + url = self.meta.get('favorite-of', + self.meta.get('like-of', + self.meta.get('bookmark-of', + False + ) + ) + ) + img = self.meta.get('image', False) + if not img: + return + if not url: + return + + c = '[![%s](/%s/%s)](%s){.favurl}' % ( + self.title, + shared.config.get('source', 'files'), + img, + url + ) + + if self.isbookmark: + c = "%s\n\n%s" % (c, self.content) + + self.content = c def __filter_images(self): linkto = False @@ -1191,6 +1325,8 @@ class Singular(BaseRenderable): 'bookmark-of': 'bookmark', 'repost-of': 'repost', 'in-reply-to': 'reply', + 'favorite-of': 'fav', + 'like-of': 'like', } reactions = {} @@ -1281,6 +1417,25 @@ class Singular(BaseRenderable): def isbookmark(self): return 
self.meta.get('bookmark-of', False) + @property + def isreply(self): + return self.meta.get('in-reply-to', False) + + # TODO + #@property + #def isrvsp(self): + # r'([^<]+)' + + @property + def isfav(self): + r = False + for maybe in ['like-of', 'favorite-of']: + maybe = self.meta.get(maybe, False) + if maybe: + r = maybe + break + return r + @property def ispage(self): if not self.meta: @@ -1289,7 +1444,11 @@ class Singular(BaseRenderable): @property def isonfront(self): - if self.ispage or self.isbookmark: + if self.ispage: + return False + if self.isbookmark: + return False + if self.isfav: return False return True @@ -1366,59 +1525,9 @@ class Singular(BaseRenderable): @property def exif(self): if not self.isphoto: - return None + return {} - if hasattr(self, '_exif'): - return self._exif - - exif = {} - mapping = { - 'camera': [ - 'Model' - ], - 'aperture': [ - 'FNumber', - 'Aperture' - ], - 'shutter_speed': [ - 'ExposureTime' - ], - 'focallength35mm': [ - 'FocalLengthIn35mmFormat', - ], - 'focallength': [ - 'FocalLength', - ], - 'iso': [ - 'ISO' - ], - 'lens': [ - 'LensID', - ], - 'date': [ - 'CreateDate', - 'DateTimeOriginal', - ], - 'geo_latitude': [ - 'GPSLatitude' - ], - 'geo_longitude': [ - 'GPSLongitude' - ], - } - - for ekey, candidates in mapping.items(): - for candidate in candidates: - maybe = self.photo.meta.get(candidate, None) - if maybe: - if 'geo_' in ekey: - exif[ekey] = round(float(maybe), 5) - else: - exif[ekey] = maybe - break - - self._exif = exif - return self._exif + return self.photo.exif @property def rssenclosure(self): @@ -1441,7 +1550,8 @@ class Singular(BaseRenderable): 'category': self.category, 'reactions': self.reactions, 'updated': self.updated.datetime, - 'summary': self.sumhtml, + 'summary': self.summary, + 'sumhtml': self.sumhtml, 'exif': self.exif, 'lang': self.lang, 'syndicate': '', @@ -1459,21 +1569,9 @@ class Singular(BaseRenderable): def shortslug(self): if hasattr(self, '_shortslug'): return self._shortslug - 
self._shortslug = self.baseN(self.pubtime) + self._shortslug = shared.baseN(self.pubtime) return self._shortslug - @staticmethod - def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"): - """ Used to create short, lowecase slug for a number (an epoch) passed """ - num = int(num) - return ((num == 0) and numerals[0]) or ( - Singular.baseN( - num // b, - b, - numerals - ).lstrip(numerals[0]) + numerals[num % b] - ) - async def rendercomments(self, renderer): for comment in self.comments: await comment.render(renderer) @@ -1507,9 +1605,6 @@ class Singular(BaseRenderable): logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime) return - #if not os.path.isdir(targetdir): - #os.mkdir(targetdir) - tmplvars = { 'post': self.tmplvars, 'site': renderer.sitevars, @@ -1517,11 +1612,6 @@ class Singular(BaseRenderable): } r = renderer.j2.get_template(self.tmplfile).render(tmplvars) self.writerendered(target, r, mtime) - #with open(target, "w") as html: - #logging.debug('writing %s', target) - #html.write(r) - #html.close() - #os.utime(target, (mtime, mtime)) async def ping(self, pinger): @@ -1542,7 +1632,11 @@ class Singular(BaseRenderable): logging.info("sending webmention from %s to %s", self.url, target) ws = WebmentionSend(self.url, target) - ws.send(allow_redirects=True, timeout=30) + try: + ws.send(allow_redirects=True, timeout=30) + except Exception as e: + logging.error('ping failed to %s', target) + pinger.db[h] = record class Webmentioner(object): diff --git a/new.py b/new.py index a9c3065..93b040a 100644 --- a/new.py +++ b/new.py @@ -9,8 +9,6 @@ import glob import sys import tempfile from slugify import slugify - -import nasg import shared if __name__ == '__main__': @@ -78,7 +76,7 @@ if __name__ == '__main__': elif args['repost']: slug = slugify("re: %s" % (args['repost']), only_ascii=True, lower=True) else: - slug = nasg.Singular.baseN(now.timestamp) + slug = shared.baseN(now.timestamp) args['slug'] = input('Slug [%s]: ' % (slug)) or 
slug if args['slug'] in slugs: diff --git a/pesos.py b/pesos.py new file mode 100644 index 0000000..86402df --- /dev/null +++ b/pesos.py @@ -0,0 +1,302 @@ +import json +import os +import hashlib +import glob +import frontmatter +import requests +import shared +import logging +import re +import shutil +import arrow +import bs4 +from slugify import slugify + +from pprint import pprint + +class Bookmark(object): + def __init__(self, title, url, fname=None): + self.fm = frontmatter.loads('') + fname = fname or slugify(title) + self.fname = "%s.md" % fname + self.target = os.path.join( + shared.config.get('source', 'contentdir'), + shared.config.get('source', 'bookmarks'), + self.fname + ) + self.fm.metadata = { + 'published': arrow.utcnow().format(shared.ARROWISO), + 'title': title, + 'bookmark-of': url, + } + + def write(self): + logging.info('saving bookmark to %s', self.target) + with open(self.target, 'wt') as t: + t.write(frontmatter.dumps(self.fm)) + +class HNBookmarks(object): + prefix = 'hn-' + def __init__(self): + self.url = 'https://news.ycombinator.com/favorites?id=%s' % ( + shared.config.get('hackernews', 'user_id') + ) + + @property + def existing(self): + if hasattr(self, '_existing'): + return self._existing + + d = os.path.join( + shared.config.get('source', 'contentdir'), + "*", + "%s*.md" % self.prefix + ) + files = reversed(sorted(glob.glob(d))) + self._existing = [ + os.path.basename(f.replace(self.prefix, '').replace('.md', '')) + for f in files + ] + + return self._existing + + def run(self): + r = requests.get(self.url) + soup = bs4.BeautifulSoup(r.text, "html5lib") + rows = soup.find_all('tr', attrs={'class':'athing' }) + for row in rows: + rid = row.get('id') + if rid in self.existing: + continue + + link = row.find('a', attrs={'class':'storylink' }) + url = link.get('href') + title = " ".join(link.contents) + fname = "%s%s" % (self.prefix, rid) + + bookmark = Bookmark(title, url, fname) + bookmark.write() + +class Fav(object): + def 
__init__(self): + self.arrow = arrow.utcnow() + self.fm = frontmatter.loads('') + + @property + def target(self): + return os.path.join( + shared.config.get('source', 'contentdir'), + shared.config.get('source', 'favs'), + self.fname + ) + + @property + def exists(self): + return False + #return os.path.isfile(self.target) + + @property + def imgname(self): + # the _ is to differentiate between my photos, where the md and jpg name is the same, and favs + return self.fname.replace('.md', '_.jpg') + + @property + def imgtarget(self): + return os.path.join( + shared.config.get('source', 'filesdir'), + self.imgname + ) + + def saveimg(self, url): + target = self.imgtarget + if os.path.isfile(target): + logging.error("%s already exists, refusing to overwrite", target) + return + + logging.info("pulling image %s to files", url) + r = requests.get(url, stream=True) + if r.status_code == 200: + with open(target, 'wb') as f: + r.raw.decode_content = True + shutil.copyfileobj(r.raw, f) + + def write(self): + logging.info('saving fav to %s', self.target) + with open(self.target, 'wt') as t: + t.write(frontmatter.dumps(self.fm)) + os.utime(self.target, (self.arrow.timestamp, self.arrow.timestamp)) + + +class FlickrFav(Fav): + def __init__(self, photo): + super(FlickrFav, self).__init__() + self.photo = photo + self.ownerid = photo.get('owner') + self.photoid = photo.get('id') + self.fname = "flickr-%s-%s.md" % (self.ownerid, self.photoid) + self.url = "https://www.flickr.com/photos/%s/%s" % (self.ownerid, self.photoid) + + def run(self): + img = self.photo.get('url_b', self.photo.get('url_z', False)) + if not img: + logging.error("image url was empty for %s, skipping fav", self.url) + return + + self.saveimg(img) + self.arrow = arrow.get( + self.photo.get('date_faved', arrow.utcnow().timestamp) + ) + self.fm.metadata = { + 'published': self.arrow.format(shared.ARROWISO), + 'title': '%s' % self.photo.get('title', self.fname), + 'favorite-of': self.url, + 'flickr_tags': 
self.photo.get('tags', '').split(' '), + 'geo': { + 'latitude': self.photo.get('latitude', ''), + 'longitude': self.photo.get('longitude', ''), + }, + 'author': { + 'name': self.photo.get('owner_name'), + 'url': 'https://www.flickr.com/people/%s' % ( + self.photo.get('owner') + ), + }, + 'image': self.imgname + } + + content = self.photo.get('description', {}).get('_content', '') + content = shared.Pandoc(False).convert(content) + self.fm.content = content + + +class FivehpxFav(Fav): + def __init__(self, photo): + super(FivehpxFav, self).__init__() + self.photo = photo + self.ownerid = photo.get('user_id') + self.photoid = photo.get('id') + self.fname = "500px-%s-%s.md" % (self.ownerid, self.photoid) + self.url = "https://www.500px.com%s" % (photo.get('url')) + + def run(self): + img = self.photo.get('images')[0].get('url') + if not img: + logging.error("image url was empty for %s, skipping fav", self.url) + return + + self.saveimg(img) + self.arrow = arrow.get( + self.photo.get('created_at', arrow.utcnow().timestamp) + ) + self.fm.metadata = { + 'published': self.arrow.format(shared.ARROWISO), + 'title': '%s' % self.photo.get('name', self.fname), + 'favorite-of': self.url, + 'fivehpx_tags': self.photo.get('tags', []), + 'geo': { + 'latitude': self.photo.get('latitude', ''), + 'longitude': self.photo.get('longitude', ''), + }, + 'author': { + 'name': self.photo.get('user').get('fullname', self.ownerid), + 'url': 'https://www.500px.com/%s' % ( + self.photo.get('user').get('username', self.ownerid) + ), + }, + 'image': self.imgname + } + + content = self.photo.get('description', '') + if content: + content = shared.Pandoc(False).convert(content) + else: + content = '' + self.fm.content = content + +class Favs(object): + def __init__(self, confgroup): + self.confgroup = confgroup + self.url = shared.config.get(confgroup, 'fav_api') + + @property + def lastpulled(self): + return 0 + + mtime = 0 + d = os.path.join( + shared.config.get('source', 'contentdir'), + 
shared.config.get('source', 'favs'), + "%s-*.md" % self.confgroup + ) + files = glob.glob(d) + for f in files: + ftime = int(os.path.getmtime(f)) + if ftime > mtime: + mtime = ftime + + mtime = mtime + 1 + logging.debug("last flickr fav timestamp: %s", mtime) + return mtime + + +class FlickrFavs(Favs): + def __init__(self): + super(FlickrFavs, self).__init__('flickr') + self.params = { + 'method': 'flickr.favorites.getList', + 'api_key': shared.config.get('flickr', 'api_key'), + 'user_id': shared.config.get('flickr', 'user_id'), + 'extras': 'description,geo,tags,url_z,url_b,owner_name,date_upload', + 'per_page': 500, + 'format': 'json', + 'nojsoncallback': '1', + 'min_fave_date': self.lastpulled + } + + def run(self): + r = requests.get(self.url,params=self.params) + js = json.loads(r.text) + for photo in js.get('photos', {}).get('photo', []): + fav = FlickrFav(photo) + fav.run() + fav.write() + + +class FivehpxFavs(Favs): + def __init__(self): + super(FivehpxFavs, self).__init__('500px') + self.params = { + 'consumer_key': shared.config.get('500px', 'api_key'), + 'rpp': 100, + 'image_size': 4, + 'include_tags': 1, + 'include_geo': 1 + } + + def run(self): + r = requests.get(self.url,params=self.params) + js = json.loads(r.text) + for photo in js.get('photos', []): + fav = FivehpxFav(photo) + if not fav.exists: + fav.run() + fav.write() + + +if __name__ == '__main__': + while len(logging.root.handlers) > 0: + logging.root.removeHandler(logging.root.handlers[-1]) + + logging.basicConfig( + level=20, + format='%(asctime)s - %(levelname)s - %(message)s' + ) + + flickr = FlickrFavs() + flickr.run() + + hn = HNBookmarks() + hn.run() + + fivehpx = FivehpxFavs() + fivehpx.run() diff --git a/search.py b/search.py old mode 100644 new mode 100755 index e1cc10d..daa7cce --- a/search.py +++ b/search.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 +import os +#import sys +#sys.path.append(os.path.dirname(os.path.abspath(__file__))) + import asyncio import uvloop -import os - from 
sanic import Sanic import sanic.response from sanic.log import log as logging @@ -66,8 +68,8 @@ if __name__ == '__main__': jenv = jinja2.Environment(loader=jldr) tmpl = jenv.get_template('searchresults.html') - @app.route("/search") - async def search(request, methods=["GET"]): + @app.route("/search", methods=["GET"]) + async def search(request): query = request.args.get('s') r = SearchHandler(query, tmpl) return r diff --git a/shared.py b/shared.py index cdcfd00..3beffd4 100644 --- a/shared.py +++ b/shared.py @@ -25,6 +25,18 @@ def __expandconfig(config): )) return config +def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"): + """ Used to create short, lowecase slug for a number (an epoch) passed """ + num = int(num) + return ((num == 0) and numerals[0]) or ( + baseN( + num // b, + b, + numerals + ).lstrip(numerals[0]) + numerals[num % b] + ) + + ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ' STRFISO = '%Y-%m-%dT%H:%M:%S%z' diff --git a/webmention.py b/webmention.py old mode 100644 new mode 100755 index 363880d..44f23d7 --- a/webmention.py +++ b/webmention.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import asyncio import uvloop import os @@ -111,8 +113,7 @@ class WebmentionHandler(object): def _save(self): target = os.path.join( shared.config.get('source', 'commentsdir'), - self.mhash, - '.md' + "%s.md" % self.mhash ) if os.path.isfile(target):