#!/usr/bin/env python3
"""Pull favourites and bookmarks from Hacker News, Pinterest, Flickr, 500px,
Tumblr and deviantART and save them as front-matter markdown files (plus the
referenced images) for NASG."""

import json
import os
import hashlib
import glob
import logging
import re
import shutil
import argparse

import arrow
import bs4
import frontmatter
import requests
from slugify import slugify

import oauth
import shared


class Bookmark(object):
    """A single bookmark, saved as a front-matter markdown file."""

    def __init__(self, title, url, fname=None):
        self.fm = frontmatter.loads('')
        fname = fname or slugify(title)
        self.fname = "%s.md" % fname
        self.target = os.path.join(
            shared.config.get('source', 'contentdir'),
            shared.config.get('source', 'bookmarks'),
            self.fname
        )
        self.fm.metadata = {
            'published': arrow.utcnow().format(shared.ARROWISO),
            'title': title,
            'bookmark-of': url,
        }

    def write(self):
        logging.info('saving bookmark to %s', self.target)
        with open(self.target, 'wt') as t:
            t.write(frontmatter.dumps(self.fm))


class HNBookmarks(object):
    """Scrape the Hacker News favorites page and save new entries as bookmarks."""

    prefix = 'hn-'

    def __init__(self):
        self.url = 'https://news.ycombinator.com/favorites?id=%s' % (
            shared.config.get('hackernews', 'user_id')
        )

    @property
    def existing(self):
        # IDs of already saved favorites, derived from the existing filenames
        if hasattr(self, '_existing'):
            return self._existing
        d = os.path.join(
            shared.config.get('source', 'contentdir'),
            "*",
            "%s*.md" % self.prefix
        )
        files = reversed(sorted(glob.glob(d)))
        self._existing = [
            os.path.basename(f.replace(self.prefix, '').replace('.md', ''))
            for f in files
        ]
        return self._existing

    def run(self):
        r = requests.get(self.url)
        soup = bs4.BeautifulSoup(r.text, "html5lib")
        rows = soup.find_all('tr', attrs={'class': 'athing'})
        for row in rows:
            rid = row.get('id')
            if rid in self.existing:
                continue
            link = row.find('a', attrs={'class': 'storylink'})
            url = link.get('href')
            title = " ".join(link.contents)
            fname = "%s%s" % (self.prefix, rid)
            bookmark = Bookmark(title, url, fname)
            bookmark.write()


class Fav(object):
    """Base class for a single favourited entry."""

    def __init__(self):
        self.arrow = arrow.utcnow()
        self.fm = frontmatter.loads('')

    @property
    def target(self):
        return os.path.join(
            shared.config.get('source', 'contentdir'),
            shared.config.get('source', 'favs'),
            self.fname
        )

    @property
    def exists(self):
        return os.path.isfile(self.target)

    @property
    def imgname(self):
        # the trailing _ differentiates favs from my own photos, where the
        # .md and the .jpg share the same basename
        return self.fname.replace('.md', '_.jpg')

    @property
    def imgtarget(self):
        return os.path.join(
            shared.config.get('source', 'filesdir'),
            self.imgname
        )

    def saveimg(self, url, target=None):
        target = target or self.imgtarget
        if os.path.isfile(target):
            logging.error("%s already exists, refusing to overwrite", target)
            return
        logging.info("pulling image %s to files", url)
        r = requests.get(url, stream=True)
        if r.status_code == 200:
            with open(target, 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)

    def write(self):
        logging.info('saving fav to %s', self.target)
        with open(self.target, 'wt') as t:
            t.write(frontmatter.dumps(self.fm))
        os.utime(self.target, (self.arrow.timestamp, self.arrow.timestamp))


class PinterestFav(Fav):
    def __init__(self, url):
        super(PinterestFav, self).__init__()
        self.url = url
        self.fname = "pinterest-%s.md" % (
            list(filter(None, url.split('/')))[-1]
        )

    def run(self):
        try:
            r = requests.get(self.url)
            soup = bs4.BeautifulSoup(r.text, 'lxml')
            # Pinterest embeds the pin details as JSON-LD
            ld = json.loads(
                soup.find('script', type='application/ld+json').text
            )
            imgurl = ld.get('image')
            self.saveimg(imgurl)
            self.fm.metadata = {
                'published': arrow.get(
                    ld.get('datePublished', arrow.utcnow().timestamp)
                ).format(shared.ARROWISO),
                'title': ld.get('headline', self.url),
                'favorite-of': self.url,
                'image': self.imgname
            }
            content = ld.get('articleBody', '')
            content = shared.Pandoc(False).convert(content)
            self.fm.content = content
        except Exception as e:
            logging.error('saving pinterest fav %s failed: %s', self.url, e)
            return


class FlickrFav(Fav):
    url = 'https://api.flickr.com/services/rest/'

    def __init__(self, photo):
        super(FlickrFav, self).__init__()
        self.photo = photo
        self.ownerid = photo.get('owner')
        self.photoid = photo.get('id')
        self.fname = "flickr-%s-%s.md" % (self.ownerid, self.photoid)
        self.url = "https://www.flickr.com/photos/%s/%s" % (
            self.ownerid,
            self.photoid
        )

    def run(self):
        img = self.photo.get('url_b', self.photo.get('url_z', False))
        if not img:
            logging.error("image url was empty for %s, skipping fav", self.url)
            return
        self.saveimg(img)
        self.arrow = arrow.get(
            self.photo.get('date_faved', arrow.utcnow().timestamp)
        )
        self.fm.metadata = {
            'published': self.arrow.format(shared.ARROWISO),
            'title': '%s' % self.photo.get('title', self.fname),
            'favorite-of': self.url,
            'flickr_tags': self.photo.get('tags', '').split(' '),
            'geo': {
                'latitude': self.photo.get('latitude', ''),
                'longitude': self.photo.get('longitude', ''),
            },
            'author': {
                'name': self.photo.get('owner_name'),
                'url': 'https://www.flickr.com/people/%s' % (
                    self.photo.get('owner')
                ),
            },
            'image': self.imgname
        }
        content = self.photo.get('description', {}).get('_content', '')
        content = shared.Pandoc(False).convert(content)
        self.fm.content = content


class FivehpxFav(Fav):
    def __init__(self, photo):
        super(FivehpxFav, self).__init__()
        self.photo = photo
        self.ownerid = photo.get('user_id')
        self.photoid = photo.get('id')
        self.fname = "500px-%s-%s.md" % (self.ownerid, self.photoid)
        self.url = "https://www.500px.com%s" % (photo.get('url'))

    def run(self):
        img = self.photo.get('images')[0].get('url')
        if not img:
            logging.error("image url was empty for %s, skipping fav", self.url)
            return
        self.saveimg(img)
        self.arrow = arrow.get(
            self.photo.get('created_at', arrow.utcnow().timestamp)
        )
        self.fm.metadata = {
            'published': self.arrow.format(shared.ARROWISO),
            'title': '%s' % self.photo.get('name', self.fname),
            'favorite-of': self.url,
            'fivehpx_tags': self.photo.get('tags', []),
            'geo': {
                'latitude': self.photo.get('latitude', ''),
                'longitude': self.photo.get('longitude', ''),
            },
            'author': {
                'name': self.photo.get('user').get('fullname', self.ownerid),
                'url': 'https://www.500px.com/%s' % (
                    self.photo.get('user').get('username', self.ownerid)
                ),
            },
            'image': self.imgname
        }
        content = self.photo.get('description', '')
        if content:
            content = shared.Pandoc(False).convert(content)
        else:
            content = ''
        self.fm.content = content


class TumblrFav(Fav):
    def __init__(self, like):
        super(TumblrFav, self).__init__()
        self.like = like
        self.blogname = like.get('blog_name')
        self.postid = like.get('id')
        self.fname = "tumblr-%s-%s.md" % (self.blogname, self.postid)
        self.url = like.get('post_url')
        self.images = []

    def run(self):
        icntr = 0
        for p in self.like.get('photos', []):
            i = p.get('original_size').get('url')
            logging.debug('parsing image %s', i)
            n = self.fname.replace('.md', '_%d.jpg' % icntr)
            self.images.append(n)
            nt = os.path.join(
                shared.config.get('source', 'filesdir'),
                n
            )
            self.saveimg(i, nt)
            icntr = icntr + 1
        self.arrow = arrow.get(
            self.like.get(
                'liked_timestamp',
                self.like.get('date', arrow.utcnow().timestamp)
            )
        )
        self.fm.content = self.like.get('caption', '')
        title = self.like.get('summary', '').strip()
        if not len(title):
            title = self.like.get('slug', '').strip()
        if not len(title):
            title = shared.slugfname(self.like.get('post_url'))
        self.fm.metadata = {
            'published': self.arrow.format(shared.ARROWISO),
            'title': title,
            'favorite-of': self.url,
            'tumblr_tags': self.like.get('tags'),
            'author': {
                'name': self.like.get('blog_name'),
                'url': 'http://%s.tumblr.com' % self.like.get('blog_name')
            },
            'images': self.images
        }


class DAFav(Fav):
    def __init__(self, fav):
        super(DAFav, self).__init__()
        self.fav = fav
        self.deviationid = fav.get('deviationid')
        self.url = fav.get('url')
        self.title = fav.get('title', False) or self.deviationid
        self.author = self.fav.get('author').get('username')
        self.fname = "deviantart-%s-by-%s.md" % (
            slugify(self.title),
            slugify(self.author)
        )
        self.image = fav.get('content', {}).get('src')

    def run(self):
        self.saveimg(self.image)
        self.arrow = arrow.get(
            self.fav.get('published_time', arrow.utcnow().timestamp)
        )
        self.fm.metadata = {
            'published': self.arrow.format(shared.ARROWISO),
            'title': '%s' % self.title,
            'favorite-of': self.url,
            'da_tags': [
                t.get('tag_name')
                for t in self.fav.get('meta', {}).get('tags', [])
            ],
            'author': {
                'name': self.author,
                'url': 'https://%s.deviantart.com' % (self.author),
            },
            'image': self.imgname
        }
        content = self.fav.get('meta', {}).get('description', '')
        content = shared.Pandoc(False).convert(content)
        self.fm.content = content


class Favs(object):
    """Base class for pulling favourites from a service."""

    def __init__(self, confgroup):
        self.confgroup = confgroup

    @property
    def lastpulled(self):
        # mtime of the newest already saved fav for this service, plus one
        # second, so only newer favourites get requested
        mtime = 0
        d = os.path.join(
            shared.config.get('source', 'contentdir'),
            shared.config.get('source', 'favs'),
            "%s-*.md" % self.confgroup
        )
        files = glob.glob(d)
        for f in files:
            ftime = int(os.path.getmtime(f))
            if ftime > mtime:
                mtime = ftime
        mtime = mtime + 1
        logging.debug("last %s fav timestamp: %s", self.confgroup, mtime)
        return mtime


class FlickrFavs(Favs):
    url = 'https://api.flickr.com/services/rest/'

    def __init__(self):
        super(FlickrFavs, self).__init__('flickr')
        self.get_uid()
        self.params = {
            'method': 'flickr.favorites.getList',
            'api_key': shared.config.get('flickr', 'api_key'),
            'user_id': self.uid,
            'extras': 'description,geo,tags,url_z,url_b,owner_name,date_upload',
            'per_page': 500,  # maximum
            'format': 'json',
            'nojsoncallback': '1',
            'min_fave_date': self.lastpulled
        }

    def get_uid(self):
        params = {
            'method': 'flickr.people.findByUsername',
            'api_key': shared.config.get('flickr', 'api_key'),
            'format': 'json',
            'nojsoncallback': '1',
            'username': shared.config.get('flickr', 'username'),
        }
        r = requests.get(
            self.url,
            params=params
        )
        parsed = json.loads(r.text)
        self.uid = parsed.get('user', {}).get('id')

    def getpaged(self, offset):
        logging.info('requesting page #%d of paginated results', offset)
        self.params.update({
            'page': offset
        })
        r = requests.get(
            self.url,
            params=self.params
        )
        parsed = json.loads(r.text)
        return parsed.get('photos', {}).get('photo', [])

    def run(self):
        r = requests.get(self.url, params=self.params)
        js = json.loads(r.text)
        js = js.get('photos', {})

        photos = js.get('photo', [])
        total = int(js.get('pages', 1))
        current = int(js.get('page', 1))
        cntr = total - current

        while cntr > 0:
            current = current + 1
            paged = self.getpaged(current)
            photos = photos + paged
            cntr = total - current

        for photo in photos:
            fav = FlickrFav(photo)
            if not fav.exists:
                fav.run()
                fav.write()


class FivehpxFavs(Favs):
    def __init__(self):
        super(FivehpxFavs, self).__init__('500px')
        self.params = {
            'consumer_key': shared.config.get('500px', 'api_key'),
            'rpp': 100,  # maximum
            'image_size': 4,
            'include_tags': 1,
            'include_geo': 1,
            'sort': 'created_at',
            'sort_direction': 'desc'
        }
        self.oauth = oauth.FivehpxOauth()
        self.uid = None
        self.galid = None

    def get_uid(self):
        r = self.oauth.request(
            'https://api.500px.com/v1/users',
            params={}
        )
        js = json.loads(r.text)
        self.uid = js.get('user', {}).get('id')

    def get_favgalid(self):
        r = self.oauth.request(
            'https://api.500px.com/v1/users/%s/galleries' % (self.uid),
            params={
                # kind 5 is the favourites gallery, see
                # https://github.com/500px/api-documentation/blob/master/basics/formats_and_terms.md#gallery-kinds
                'kinds': 5
            }
        )
        js = json.loads(r.text)
        g = js.get('galleries', []).pop()
        self.galid = g.get('id')

    @property
    def url(self):
        return 'https://api.500px.com/v1/users/%s/galleries/%s/items' % (
            self.uid,
            self.galid
        )

    def getpaged(self, offset):
        logging.info('requesting page #%d of paginated results', offset)
        self.params.update({
            'page': offset
        })
        r = requests.get(
            self.url,
            params=self.params
        )
        parsed = json.loads(r.text)
        return parsed.get('photos')

    def run(self):
        self.get_uid()
        self.get_favgalid()
        r = requests.get(self.url, params=self.params)
        js = json.loads(r.text)

        photos = js.get('photos')
        total = int(js.get('total_pages', 1))
        current = int(js.get('current_page', 1))
        cntr = total - current

        while cntr > 0:
            current = current + 1
            paged = self.getpaged(current)
            photos = photos + paged
            cntr = total - current

        for photo in photos:
            fav = FivehpxFav(photo)
            if not fav.exists:
                fav.run()
                fav.write()


class TumblrFavs(Favs):
    url = 'https://api.tumblr.com/v2/user/likes'

    def __init__(self):
        super(TumblrFavs, self).__init__('tumblr')
        self.oauth = oauth.TumblrOauth()
        self.params = {
            'after': self.lastpulled
        }
        self.likes = []

    def getpaged(self, offset):
        r = self.oauth.request(
            self.url,
            params={'offset': offset}
        )
        return json.loads(r.text)

    def run(self):
        r = self.oauth.request(
            self.url,
            params=self.params
        )
        js = json.loads(r.text)

        total = int(js.get('response', {}).get('liked_count', 20))
        offset = 20
        cntr = total - offset
        likes = js.get('response', {}).get('liked_posts', [])

        while cntr > 0:
            paged = self.getpaged(offset)
            likes = likes + paged.get('response', {}).get('liked_posts', [])
            offset = offset + 20
            cntr = total - offset

        self.likes = likes
        for like in self.likes:
            fav = TumblrFav(like)
            if not fav.exists:
                fav.run()
                fav.write()


class DAFavs(Favs):
    def __init__(self):
        super(DAFavs, self).__init__('deviantart')
        self.username = shared.config.get(self.confgroup, 'username')
        self.oauth = oauth.DAOauth()
        self.likes = []
        self.galid = None
        self.params = {
            'limit': 24,  # this is the max as far as I can tell
            'mature_content': 'true',
            'username': self.username
        }

    def get_favgalid(self):
        r = self.oauth.request(
            'https://www.deviantart.com/api/v1/oauth2/collections/folders',
            params={
                'username': self.username,
                'calculate_size': 'false',
                'ext_preload': 'false',
                'mature_content': 'true'
            }
        )
        js = json.loads(r.text)
        for g in js.get('results', []):
            if 'Featured' == g.get('name'):
                self.galid = g.get('folderid')
                break

    @property
    def url(self):
        return 'https://www.deviantart.com/api/v1/oauth2/collections/%s' % (
            self.galid
        )

    def getpaged(self, offset):
        self.params.update({'offset': offset})
        r = self.oauth.request(
            self.url,
            self.params
        )
        js = json.loads(r.text)
        return js

    def getsinglemeta(self, daid):
        r = self.oauth.request(
            'https://www.deviantart.com/api/v1/oauth2/deviation/metadata',
            params={
                'deviationids[]': daid,
                'ext_submission': False,
                'ext_camera': False,
                'ext_stats': False,
                'ext_collection': False,
                'mature_content': True,
            }
        )
        meta = {}
        try:
            meta = json.loads(r.text)
            return meta.get('metadata', []).pop()
        except Exception:
            return meta

    def has_more(self, q):
        # the API is inconsistent about the type of has_more
        if True == q or 'True' == q or 'true' == q:
            return True
        return False

    def run(self):
        self.get_favgalid()
        r = self.oauth.request(
            self.url,
            self.params
        )
        js = json.loads(r.text)
        favs = js.get('results', [])
        has_more = self.has_more(js.get('has_more'))
        offset = js.get('next_offset')
        while has_more:
            logging.info('iterating over DA results with offset %d', offset)
            paged = self.getpaged(offset)
            new = paged.get('results', [])
            if not len(new):
                # logging.error('empty results from deviantART, breaking loop')
                break
            favs = favs + new
            has_more = self.has_more(paged.get('has_more'))
            if not has_more:
                break
            n = int(paged.get('next_offset'))
            if not n:
                break
            offset = offset + n
        self.favs = favs
        for fav in self.favs:
            f = DAFav(fav)
            if f.exists:
                continue
            f.fav.update({
                'meta': self.getsinglemeta(fav.get('deviationid'))
            })
            f.run()
            f.write()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Parameters for NASG')
    parser.add_argument(
        '--loglevel',
        default='error',
        help='change loglevel'
    )
    params = vars(parser.parse_args())

    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging.basicConfig(
        level=shared.LLEVEL[params.get('loglevel')],
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    flickr = FlickrFavs()
    flickr.run()

    hn = HNBookmarks()
    hn.run()

    fivehpx = FivehpxFavs()
    fivehpx.run()

    tumblr = TumblrFavs()
    tumblr.run()

    da = DAFavs()
    da.run()
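# Usage sketch: the script filename below is an assumption, and the loglevel
# value must be a key of shared.LLEVEL ('error' is the default; others depend
# on that mapping):
#
#   python3 favs.py --loglevel error
#
# Paths and credentials come from shared.config ('source' for contentdir,
# filesdir, bookmarks and favs; 'hackernews', 'flickr', '500px' and
# 'deviantart' for per-service settings) and from the oauth module's
# FivehpxOauth, TumblrOauth and DAOauth helpers.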