nasg/pesos.py

694 lines
20 KiB
Python
Executable file

#!/usr/bin/env python3
import json
import os
import hashlib
import glob
import frontmatter
import requests
import shared
import logging
import re
import shutil
import arrow
import bs4
from slugify import slugify
import oauth
import argparse
class Bookmark(object):
def __init__(self, title, url, fname=None):
self.fm = frontmatter.loads('')
fname = fname or slugify(title)
self.fname = "%s.md" % fname
self.target = os.path.join(
shared.config.get('source', 'contentdir'),
shared.config.get('source', 'bookmarks'),
self.fname
)
self.fm.metadata = {
'published': arrow.utcnow().format(shared.ARROWISO),
'title': title,
'bookmark-of': url,
}
def write(self):
logging.info('saving bookmark to %s', self.target)
with open(self.target, 'wt') as t:
t.write(frontmatter.dumps(self.fm))
class HNBookmarks(object):
prefix = 'hn-'
def __init__(self):
self.url = 'https://news.ycombinator.com/favorites?id=%s' % (
shared.config.get('hackernews', 'user_id')
)
@property
def existing(self):
if hasattr(self, '_existing'):
return self._existing
d = os.path.join(
shared.config.get('source', 'contentdir'),
"*",
"%s*.md" % self.prefix
)
files = reversed(sorted(glob.glob(d)))
self._existing = [
os.path.basename(f.replace(self.prefix, '').replace('.md', ''))
for f in files
]
return self._existing
def run(self):
r = requests.get(self.url)
soup = bs4.BeautifulSoup(r.text, "html5lib")
rows = soup.find_all('tr', attrs={'class':'athing' })
for row in rows:
rid = row.get('id')
if rid in self.existing:
continue
link = row.find('a', attrs={'class':'storylink' })
url = link.get('href')
title = " ".join(link.contents)
fname = "%s%s" % (self.prefix, rid)
bookmark = Bookmark(title, url, fname)
bookmark.write()
class Fav(object):
def __init__(self):
self.arrow = arrow.utcnow()
self.fm = frontmatter.loads('')
@property
def target(self):
return os.path.join(
shared.config.get('source', 'contentdir'),
shared.config.get('source', 'favs'),
self.fname
)
@property
def exists(self):
return os.path.isfile(self.target)
@property
def imgname(self):
# the _ is to differentiate between my photos, where the md and jpg name is the same, and favs
return self.fname.replace('.md', '_.jpg')
@property
def imgtarget(self):
return os.path.join(
shared.config.get('source', 'filesdir'),
self.imgname
)
def saveimg(self, url, target=None):
target = target or self.imgtarget
if os.path.isfile(target):
logging.error("%s already exists, refusing to overwrite", target)
return
logging.info("pulling image %s to files", url)
r = requests.get(url, stream=True)
if r.status_code == 200:
with open(target, 'wb') as f:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)
def write(self):
logging.info('saving fav to %s', self.target)
with open(self.target, 'wt') as t:
t.write(frontmatter.dumps(self.fm))
os.utime(self.target, (self.arrow.timestamp, self.arrow.timestamp))
class PinterestFav(Fav):
def __init__(self, url):
super(PinterestFav, self).__init__()
self.url = url
self.fname = "pinterest-%s.md" % (list(filter(None, url.split('/')))[-1])
def run(self):
try:
r = requests.get(self.url)
soup = bs4.BeautifulSoup(r.text, 'lxml')
ld = json.loads(soup.find('script', type='application/ld+json').text)
imgurl = ld.get('image')
self.saveimg(imgurl)
self.fm.metadata = {
'published': arrow.get(
ld.get('datePublished', arrow.utcnow().timestamp)
).format(shared.ARROWISO),
'title': ld.get('headline', self.url),
'favorite-of': self.url,
'image': self.imgname
}
content = ld.get('articleBody', '')
content = shared.Pandoc(False).convert(content)
self.fm.content = content
except Exception as e:
logging.error('saving pinterest fav %s failed: %s', self.url, e)
return
class FlickrFav(Fav):
url = 'https://api.flickr.com/services/rest/'
def __init__(self, photo):
super(FlickrFav, self).__init__()
self.photo = photo
self.ownerid = photo.get('owner')
self.photoid = photo.get('id')
self.fname = "flickr-%s-%s.md" % (self.ownerid, self.photoid)
self.url = "https://www.flickr.com/photos/%s/%s" % (self.ownerid, self.photoid)
def run(self):
img = self.photo.get('url_b', self.photo.get('url_z', False))
if not img:
logging.error("image url was empty for %s, skipping fav", self.url)
return
self.saveimg(img)
self.arrow = arrow.get(
self.photo.get('date_faved', arrow.utcnow().timestamp)
)
self.fm.metadata = {
'published': self.arrow.format(shared.ARROWISO),
'title': '%s' % self.photo.get('title', self.fname),
'favorite-of': self.url,
'flickr_tags': self.photo.get('tags', '').split(' '),
'geo': {
'latitude': self.photo.get('latitude', ''),
'longitude': self.photo.get('longitude', ''),
},
'author': {
'name': self.photo.get('owner_name'),
'url': 'https://www.flickr.com/people/%s' % (
self.photo.get('owner')
),
},
'image': self.imgname
}
content = self.photo.get('description', {}).get('_content', '')
content = shared.Pandoc(False).convert(content)
self.fm.content = content
class FivehpxFav(Fav):
def __init__(self, photo):
super(FivehpxFav, self).__init__()
self.photo = photo
self.ownerid = photo.get('user_id')
self.photoid = photo.get('id')
self.fname = "500px-%s-%s.md" % (self.ownerid, self.photoid)
self.url = "https://www.500px.com%s" % (photo.get('url'))
def run(self):
img = self.photo.get('images')[0].get('url')
if not img:
logging.error("image url was empty for %s, skipping fav", self.url)
return
self.saveimg(img)
self.arrow = arrow.get(
self.photo.get('created_at', arrow.utcnow().timestamp)
)
self.fm.metadata = {
'published': self.arrow.format(shared.ARROWISO),
'title': '%s' % self.photo.get('name', self.fname),
'favorite-of': self.url,
'fivehpx_tags': self.photo.get('tags', []),
'geo': {
'latitude': self.photo.get('latitude', ''),
'longitude': self.photo.get('longitude', ''),
},
'author': {
'name': self.photo.get('user').get('fullname', self.ownerid),
'url': 'https://www.500px.com/%s' % (
self.photo.get('user').get('username', self.ownerid)
),
},
'image': self.imgname
}
content = self.photo.get('description', '')
if content:
content = shared.Pandoc(False).convert(content)
else:
content = ''
self.fm.content = content
class TumblrFav(Fav):
def __init__(self, like):
super(TumblrFav, self).__init__()
self.like = like
self.blogname = like.get('blog_name')
self.postid = like.get('id')
self.fname = "tumblr-%s-%s.md" % (self.blogname, self.postid)
self.url = like.get('post_url')
self.images = []
def run(self):
icntr = 0
for p in self.like.get('photos', []):
i = p.get('original_size').get('url')
logging.debug('parsing image %s', i)
n = self.fname.replace('.md', '_%d.jpg' % icntr)
self.images.append(n)
nt = os.path.join(
shared.config.get('source', 'filesdir'),
n
)
self.saveimg(i, nt)
icntr = icntr + 1
self.arrow = arrow.get(
self.like.get('liked_timestamp',
self.like.get('date',
arrow.utcnow().timestamp
)
)
)
self.fm.content = self.like.get('caption', '')
title = self.like.get('summary', '').strip()
if not len(title):
title = self.like.get('slug', '').strip()
if not len(title):
title = shared.slugfname(self.like.get('post_url'))
self.fm.metadata = {
'published': self.arrow.format(shared.ARROWISO),
'title': title,
'favorite-of': self.url,
'tumblr_tags': self.like.get('tags'),
'author': {
'name': self.like.get('blog_name'),
'url': 'http://%s.tumblr.com' % self.like.get('blog_name')
},
'images': self.images
}
class DAFav(Fav):
def __init__(self, fav):
super(DAFav, self).__init__()
self.fav = fav
self.deviationid = fav.get('deviationid')
self.url = fav.get('url')
self.title = fav.get('title', False) or self.deviationid
self.author = self.fav.get('author').get('username')
self.fname = "deviantart-%s-by-%s.md" % (
slugify(self.title), slugify(self.author)
)
self.image = fav.get('content', {}).get('src')
def run(self):
self.saveimg(self.image)
self.arrow = arrow.get(
self.fav.get('published_time', arrow.utcnow().timestamp)
)
self.fm.metadata = {
'published': self.arrow.format(shared.ARROWISO),
'title': '%s' % self.title,
'favorite-of': self.url,
'da_tags': [t.get('tag_name') for t in self.fav.get('meta', {}).get('tags', [])],
'author': {
'name': self.author,
'url': 'https://%s.deviantart.com' % (self.author),
},
'image': self.imgname
}
content = self.fav.get('meta', {}).get('description', '')
content = shared.Pandoc(False).convert(content)
self.fm.content = content
class Favs(object):
def __init__(self, confgroup):
self.confgroup = confgroup
@property
def lastpulled(self):
mtime = 0
d = os.path.join(
shared.config.get('source', 'contentdir'),
shared.config.get('source', 'favs'),
"%s-*.md" % self.confgroup
)
files = glob.glob(d)
for f in files:
ftime = int(os.path.getmtime(f))
if ftime > mtime:
mtime = ftime
mtime = mtime + 1
logging.debug("last flickr fav timestamp: %s", mtime)
return mtime
class FlickrFavs(Favs):
url = 'https://api.flickr.com/services/rest/'
def __init__(self):
super(FlickrFavs, self).__init__('flickr')
self.get_uid()
self.params = {
'method': 'flickr.favorites.getList',
'api_key': shared.config.get('flickr', 'api_key'),
'user_id': self.uid,
'extras': 'description,geo,tags,url_z,url_b,owner_name,date_upload',
'per_page': 500, # maximim
'format': 'json',
'nojsoncallback': '1',
'min_fave_date': self.lastpulled
}
def get_uid(self):
params = {
'method': 'flickr.people.findByUsername',
'api_key': shared.config.get('flickr', 'api_key'),
'format': 'json',
'nojsoncallback': '1',
'username': shared.config.get('flickr', 'username'),
}
r = requests.get(
self.url,
params=params
)
parsed = json.loads(r.text)
self.uid = parsed.get('user', {}).get('id')
def getpaged(self, offset):
logging.info('requesting page #%d of paginated results', offset)
self.params.update({
'page': offset
})
r = requests.get(
self.url,
params=self.params
)
parsed = json.loads(r.text)
return parsed.get('photos', {}).get('photo', [])
def run(self):
r = requests.get(self.url,params=self.params)
js = json.loads(r.text)
js = js.get('photos', {})
photos = js.get('photo', [])
total = int(js.get('pages', 1))
current = int(js.get('page', 1))
cntr = total - current
while cntr > 0:
current = current + 1
paged = self.getpaged(current)
photos = photos + paged
cntr = total - current
for photo in photos:
fav = FlickrFav(photo)
if not fav.exists:
fav.run()
fav.write()
class FivehpxFavs(Favs):
def __init__(self):
super(FivehpxFavs, self).__init__('500px')
self.params = {
'consumer_key': shared.config.get('500px', 'api_key'),
'rpp': 100, # maximum
'image_size': 4,
'include_tags': 1,
'include_geo': 1,
'sort': 'created_at',
'sort_direction': 'desc'
}
self.oauth = oauth.FivehpxOauth()
self.uid = None
self.galid = None
def get_uid(self):
r = self.oauth.request(
'https://api.500px.com/v1/users',
params={}
)
js = json.loads(r.text)
self.uid = js.get('user', {}).get('id')
def get_favgalid(self):
r = self.oauth.request(
'https://api.500px.com/v1/users/%s/galleries' % (self.uid),
params={
'kinds': 5 # see https://github.com/500px/api-documentation/blob/master/basics/formats_and_terms.md#gallery-kinds
}
)
js = json.loads(r.text)
g = js.get('galleries', []).pop()
self.galid = g.get('id')
@property
def url(self):
return 'https://api.500px.com/v1/users/%s/galleries/%s/items' % (
self.uid,
self.galid
)
def getpaged(self, offset):
logging.info('requesting page #%d of paginated results', offset)
self.params.update({
'page': offset
})
r = requests.get(
self.url,
params=self.params
)
parsed = json.loads(r.text)
return parsed.get('photos')
def run(self):
self.get_uid()
self.get_favgalid()
r = requests.get(self.url,params=self.params)
js = json.loads(r.text)
photos = js.get('photos')
total = int(js.get('total_pages', 1))
current = int(js.get('current_page', 1))
cntr = total - current
while cntr > 0:
current = current + 1
paged = self.getpaged(current)
photos = photos + paged
cntr = total - current
for photo in photos:
fav = FivehpxFav(photo)
if not fav.exists:
fav.run()
fav.write()
class TumblrFavs(Favs):
url = 'https://api.tumblr.com/v2/user/likes'
def __init__(self):
super(TumblrFavs, self).__init__('tumblr')
self.oauth = oauth.TumblrOauth()
self.params = {
'after': self.lastpulled
}
self.likes = []
def getpaged(self, offset):
r = self.oauth.request(
self.url,
params={'offset': offset}
)
return json.loads(r.text)
def run(self):
r = self.oauth.request(
self.url,
params=self.params
)
js = json.loads(r.text)
total = int(js.get('response', {}).get('liked_count', 20))
offset = 20
cntr = total - offset
likes = js.get('response', {}).get('liked_posts', [])
while cntr > 0:
paged = self.getpaged(offset)
likes = likes + paged.get('response', {}).get('liked_posts', [])
offset = offset + 20
cntr = total - offset
self.likes = likes
for like in self.likes:
fav = TumblrFav(like)
if not fav.exists:
fav.run()
fav.write()
class DAFavs(Favs):
def __init__(self):
from pprint import pprint
super(DAFavs, self).__init__('deviantart')
self.username = shared.config.get(self.confgroup, 'username'),
self.oauth = oauth.DAOauth()
self.likes = []
self.galid = None
self.params = {
'limit': 24, # this is the max as far as I can tell
'mature_content': 'true',
'username': self.username
}
def get_favgalid(self):
r = self.oauth.request(
'https://www.deviantart.com/api/v1/oauth2/collections/folders',
params={
'username': self.username,
'calculate_size': 'false',
'ext_preload': 'false',
'mature_content': 'true'
}
)
js = json.loads(r.text)
for g in js.get('results', []):
if 'Featured' == g.get('name'):
self.galid = g.get('folderid')
break
@property
def url(self):
return 'https://www.deviantart.com/api/v1/oauth2/collections/%s' % (self.galid)
def getpaged(self, offset):
self.params.update({'offset': offset})
r = self.oauth.request(
self.url,
self.params
)
js = json.loads(r.text)
return js
def getsinglemeta(self, daid):
r = self.oauth.request(
'https://www.deviantart.com/api/v1/oauth2/deviation/metadata',
params={
'deviationids[]': daid,
'ext_submission': False,
'ext_camera': False,
'ext_stats': False,
'ext_collection': False,
'mature_content': True,
}
)
meta = {}
try:
meta = json.loads(r.text)
return meta.get('metadata', []).pop()
except:
return meta
def has_more(self, q):
if True == q or 'True' == q or 'true' == q:
return True
return False
def run(self):
self.get_favgalid()
r = self.oauth.request(
self.url,
self.params
)
js = json.loads(r.text)
favs = js.get('results', [])
has_more = self.has_more(js.get('has_more'))
offset = js.get('next_offset')
while True == has_more:
logging.info('iterating over DA results with offset %d', offset)
paged = self.getpaged(offset)
new = paged.get('results', [])
if not len(new):
#logging.error('empty results from deviantART, breaking loop')
break
favs = favs + new
has_more = self.has_more(paged.get('has_more'))
if not has_more:
break
n = int(paged.get('next_offset'))
if not n:
break
offset = offset + n
self.favs = favs
for fav in self.favs:
f = DAFav(fav)
if f.exists:
continue
f.fav.update({'meta': self.getsinglemeta(fav.get('deviationid'))})
f.run()
f.write()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Parameters for NASG')
parser.add_argument(
'--loglevel',
default='error',
help='change loglevel'
)
params = vars(parser.parse_args())
while len(logging.root.handlers) > 0:
logging.root.removeHandler(logging.root.handlers[-1])
logging.basicConfig(
level=shared.LLEVEL[params.get('loglevel')],
format='%(asctime)s - %(levelname)s - %(message)s'
)
flickr = FlickrFavs()
flickr.run()
hn = HNBookmarks()
hn.run()
fivehpx = FivehpxFavs()
fivehpx.run()
tumblr = TumblrFavs()
tumblr.run()
da = DAFavs()
da.run()