petermolnar's repositories — nasg (a990dc27ae07970af67a5bcc3fe09a5dcc6646a7): pesos.py

pesos.py (view raw)
#!/usr/bin/env python3

import json
import os
import hashlib
import glob
import frontmatter
import requests
import shared
import logging
import re
import shutil
import arrow
import bs4
from slugify import slugify
import oauth
import argparse


class Bookmark(object):
    """A bookmark that is stored as a markdown file with front matter."""

    def __init__(self, title, url, fname=None):
        """Prepare the file name, target path and metadata of the bookmark.

        :param title: title of the bookmark; slugified into the file name
            when ``fname`` is not given
        :param url: the bookmarked URL, stored under ``bookmark-of``
        :param fname: optional file name without the ``.md`` extension
        """
        basename = fname if fname else slugify(title)
        self.fname = "%s.md" % basename

        # bookmarks live in <contentdir>/<bookmarks>/ as configured in shared
        bookmarkdir = os.path.join(
            shared.config.get('source', 'contentdir'),
            shared.config.get('source', 'bookmarks')
        )
        self.target = os.path.join(bookmarkdir, self.fname)

        meta = {}
        # the publish date is the moment the bookmark object gets created
        meta['published'] = arrow.utcnow().format(shared.ARROWISO)
        meta['title'] = title
        meta['bookmark-of'] = url

        self.fm = frontmatter.loads('')
        self.fm.metadata = meta

    def write(self):
        """Serialize the front matter document into ``self.target``."""
        logging.info('saving bookmark to %s', self.target)
        with open(self.target, 'wt') as out:
            out.write(frontmatter.dumps(self.fm))

class HNBookmarks(object):
    """Saves entries of a Hacker News favorites page as bookmarks."""

    # file name prefix of every bookmark created by this class
    prefix = 'hn-'

    def __init__(self):
        """Build the favorites page URL from the configured HN user id."""
        user_id = shared.config.get('hackernews', 'user_id')
        self.url = 'https://news.ycombinator.com/favorites?id=%s' % (user_id)

    @property
    def existing(self):
        """List of already saved ids, taken from the ``hn-*.md`` file names.

        The list is computed on first access and cached in ``_existing``.
        """
        try:
            return self._existing
        except AttributeError:
            pass

        # look into every direct subdirectory of the content directory
        pattern = os.path.join(
            shared.config.get('source', 'contentdir'),
            "*",
            "%s*.md" % self.prefix
        )
        ids = []
        for path in sorted(glob.glob(pattern), reverse=True):
            stripped = path.replace(self.prefix, '').replace('.md', '')
            ids.append(os.path.basename(stripped))

        self._existing = ids
        return ids

    def run(self):
        """Fetch the favorites page and write a bookmark for each new row."""
        response = requests.get(self.url)
        soup = bs4.BeautifulSoup(response.text, "html5lib")
        for row in soup.find_all('tr', attrs={'class': 'athing'}):
            rid = row.get('id')
            if rid in self.existing:
                continue

            link = row.find('a', attrs={'class': 'storylink'})
            Bookmark(
                title=" ".join(link.contents),
                url=link.get('href'),
                fname="%s%s" % (self.prefix, rid)
            ).write()

class Fav(object):
    """Base class of a single favourite stored as a front matter markdown file.

    ``self.fname`` is read by the properties below but is not set here; the
    subclasses in this file assign it in their own ``__init__``.
    """

    def __init__(self):
        # default timestamp, used by write() for the file's atime/mtime
        self.arrow = arrow.utcnow()
        self.fm = frontmatter.loads('')

    @property
    def target(self):
        """Path of the markdown file: <contentdir>/<favs>/<fname>."""
        return os.path.join(
            shared.config.get('source', 'contentdir'),
            shared.config.get('source', 'favs'),
            self.fname
        )

    @property
    def exists(self):
        """True when the markdown file of this favourite is already on disk."""
        return os.path.isfile(self.target)

    @property
    def imgname(self):
        """File name of the image that belongs to this favourite."""
        # the _ is to differentiate between my photos, where the md and jpg name is the same, and favs
        return self.fname.replace('.md', '_.jpg')

    @property
    def imgtarget(self):
        """Path of the image file: <filesdir>/<imgname>."""
        return os.path.join(
            shared.config.get('source', 'filesdir'),
            self.imgname
        )

    def saveimg(self, url, target=None):
        """Download ``url`` into ``target`` unless that file already exists.

        :param url: URL of the image to pull
        :param target: path to save to; defaults to ``self.imgtarget``
        :return: None; failures are logged, not raised
        """
        target = target or self.imgtarget
        if os.path.isfile(target):
            logging.error("%s already exists, refusing to overwrite", target)
            return

        logging.info("pulling image %s to files", url)
        r = requests.get(url, stream=True)
        try:
            if r.status_code != 200:
                # previously a failed download went unnoticed
                logging.error(
                    "pulling image %s failed with HTTP status %s",
                    url,
                    r.status_code
                )
                return

            with open(target, 'wb') as f:
                # let urllib3 undo gzip/deflate while copying the raw stream
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        finally:
            # a streamed response holds its connection until it is closed
            r.close()

    def write(self):
        """Write the front matter document and set the file times to ``self.arrow``."""
        logging.info('saving fav to %s', self.target)
        with open(self.target, 'wt') as t:
            t.write(frontmatter.dumps(self.fm))
        os.utime(self.target, (self.arrow.timestamp, self.arrow.timestamp))


class PinterestFav(Fav):
    """A favourite created from the JSON-LD block of a Pinterest page."""

    def __init__(self, url):
        """
        :param url: URL of the pin; its last non-empty path segment becomes
            part of the file name
        """
        super().__init__()
        self.url = url
        segments = [part for part in url.split('/') if part]
        self.fname = "pinterest-%s.md" % (segments[-1])

    def run(self):
        """Fetch the page, save its image and fill metadata and content.

        Any exception is logged and swallowed, so a failing pin never
        interrupts the caller.
        """
        try:
            page = requests.get(self.url)
            soup = bs4.BeautifulSoup(page.text, 'lxml')
            script = soup.find('script', type='application/ld+json')
            ld = json.loads(script.text)

            self.saveimg(ld.get('image'))

            # fall back to the current time when the page has no publish date
            published = arrow.get(
                ld.get('datePublished', arrow.utcnow().timestamp)
            )
            self.fm.metadata = {
                'published': published.format(shared.ARROWISO),
                'title': ld.get('headline', self.url),
                'favorite-of': self.url,
                'image': self.imgname
            }
            body = ld.get('articleBody', '')
            self.fm.content = shared.Pandoc(False).convert(body)
        except Exception as e:
            logging.error('saving pinterest fav %s failed: %s', self.url, e)


class FlickrFav(Fav):
    """A single Flickr favourite, built from one photo dict of the API response."""

    url = 'https://api.flickr.com/services/rest/'

    def __init__(self, photo):
        """
        :param photo: dict describing one photo; ``owner`` and ``id`` are
            used for the file name and the photo page URL
        """
        super(FlickrFav, self).__init__()
        self.photo = photo
        self.ownerid = photo.get('owner')
        self.photoid = photo.get('id')
        self.fname = "flickr-%s-%s.md" % (self.ownerid, self.photoid)
        self.url = "https://www.flickr.com/photos/%s/%s" % (self.ownerid, self.photoid)

    def run(self):
        """Save the image and fill metadata and content from the photo dict.

        Returns early, leaving the front matter empty, when the photo has no
        usable image URL.
        """
        # prefer url_b, fall back to url_z; `or` also covers a key that is
        # present but empty, which a .get() default would not
        img = self.photo.get('url_b') or self.photo.get('url_z')
        if not img:
            logging.error("image url was empty for %s, skipping fav", self.url)
            return

        self.saveimg(img)
        self.arrow = arrow.get(
            self.photo.get('date_faved', arrow.utcnow().timestamp)
        )
        self.fm.metadata = {
            'published': self.arrow.format(shared.ARROWISO),
            'title': '%s' % self.photo.get('title', self.fname),
            'favorite-of': self.url,
            # split() without a separator yields [] for an untagged photo,
            # where split(' ') used to yield ['']
            'flickr_tags': (self.photo.get('tags') or '').split(),
            'geo': {
                'latitude': self.photo.get('latitude', ''),
                'longitude': self.photo.get('longitude', ''),
            },
            'author': {
                'name': self.photo.get('owner_name'),
                'url': 'https://www.flickr.com/people/%s' % (
                    self.photo.get('owner')
                ),
            },
            'image': self.imgname
        }

        content = self.photo.get('description', {}).get('_content', '')
        content = shared.Pandoc(False).convert(content)
        self.fm.content = content


class FivehpxFav(Fav):
    """A single 500px favourite, built from one photo dict of the API response."""

    def __init__(self, photo):
        """
        :param photo: dict describing one photo; ``user_id`` and ``id`` are
            used for the file name, ``url`` for the photo page URL
        """
        super(FivehpxFav, self).__init__()
        self.photo = photo
        self.ownerid = photo.get('user_id')
        self.photoid = photo.get('id')
        self.fname = "500px-%s-%s.md" % (self.ownerid, self.photoid)
        self.url = "https://www.500px.com%s" % (photo.get('url'))

    def run(self):
        """Save the image and fill metadata and content from the photo dict.

        Returns early, leaving the front matter empty, when the photo has no
        usable image URL.
        """
        # a missing or empty 'images' list must reach the guard below
        # instead of raising on [0]
        images = self.photo.get('images') or []
        img = images[0].get('url') if images else None
        if not img:
            logging.error("image url was empty for %s, skipping fav", self.url)
            return

        self.saveimg(img)
        self.arrow = arrow.get(
            self.photo.get('created_at', arrow.utcnow().timestamp)
        )

        # tolerate a photo without a 'user' entry; fall back to the owner id
        user = self.photo.get('user') or {}
        self.fm.metadata = {
            'published': self.arrow.format(shared.ARROWISO),
            'title': '%s' % self.photo.get('name', self.fname),
            'favorite-of': self.url,
            'fivehpx_tags': self.photo.get('tags', []),
            'geo': {
                'latitude': self.photo.get('latitude', ''),
                'longitude': self.photo.get('longitude', ''),
            },
            'author': {
                'name': user.get('fullname', self.ownerid),
                'url': 'https://www.500px.com/%s' % (
                    user.get('username', self.ownerid)
                ),
            },
            'image': self.imgname
        }

        # the description may be missing or None; only convert real text
        content = self.photo.get('description', '')
        if content:
            content = shared.Pandoc(False).convert(content)
        else:
            content = ''
        self.fm.content = content


class TumblrFav(Fav):
    """A single Tumblr like, possibly carrying several photos."""

    def __init__(self, like):
        """
        :param like: dict describing one liked post; ``blog_name`` and ``id``
            make up the file name, ``post_url`` is the favourited URL
        """
        super().__init__()
        self.like = like
        self.images = []
        self.url = like.get('post_url')
        self.blogname = like.get('blog_name')
        self.postid = like.get('id')
        self.fname = "tumblr-%s-%s.md" % (self.blogname, self.postid)

    def run(self):
        """Save every photo of the post and fill content and metadata."""
        for index, photo in enumerate(self.like.get('photos', [])):
            imgurl = photo.get('original_size').get('url')
            logging.debug('parsing image %s', imgurl)
            # images are numbered: <fname without .md>_<index>.jpg
            imgname = self.fname.replace('.md', '_%d.jpg' % index)
            self.images.append(imgname)
            self.saveimg(
                imgurl,
                os.path.join(shared.config.get('source', 'filesdir'), imgname)
            )

        # liked_timestamp first, then the post date, then the current time
        now = arrow.utcnow().timestamp
        stamp = self.like.get('liked_timestamp', self.like.get('date', now))
        self.arrow = arrow.get(stamp)

        self.fm.content = self.like.get('caption', '')

        # first non-empty of: summary, slug, slugified post URL
        title = (
            self.like.get('summary', '').strip()
            or self.like.get('slug', '').strip()
            or shared.slugfname(self.like.get('post_url'))
        )

        blog = self.like.get('blog_name')
        self.fm.metadata = {
            'published': self.arrow.format(shared.ARROWISO),
            'title': title,
            'favorite-of': self.url,
            'tumblr_tags': self.like.get('tags'),
            'author': {
                'name': blog,
                'url': 'http://%s.tumblr.com' % blog
            },
            'images': self.images
        }


class DAFav(Fav):
    """A single deviantArt favourite, built from one deviation dict."""

    def __init__(self, fav):
        """
        :param fav: dict describing one deviation; the title (or the
            deviation id when there is no title) and the author's username
            make up the file name
        """
        super(DAFav, self).__init__()
        self.fav = fav
        self.deviationid = fav.get('deviationid')
        self.url = fav.get('url')
        self.title = fav.get('title', False) or self.deviationid
        self.author = self.fav.get('author').get('username')
        self.fname = "deviantart-%s-by-%s.md" % (
            slugify(self.title), slugify(self.author)
        )
        # None when the deviation carries no 'content' / 'src' entry
        self.image = fav.get('content', {}).get('src')

    def run(self):
        """Save the image and fill metadata and content.

        The description and tags are read from ``self.fav['meta']``, which
        the caller is expected to add before calling this method. Returns
        early, leaving the front matter empty, when there is no image URL.
        """
        # same guard as the Flickr and 500px favourites: without it a
        # deviation with no image URL passes None to saveimg()
        if not self.image:
            logging.error("image url was empty for %s, skipping fav", self.url)
            return

        self.saveimg(self.image)

        self.arrow = arrow.get(
            self.fav.get('published_time', arrow.utcnow().timestamp)
        )

        self.fm.metadata = {
            'published': self.arrow.format(shared.ARROWISO),
            'title': '%s' % self.title,
            'favorite-of': self.url,
            'da_tags': [t.get('tag_name') for t in self.fav.get('meta', {}).get('tags', [])],
            'author': {
                'name': self.author,
                'url': 'https://%s.deviantart.com' % (self.author),
            },
            'image': self.imgname
        }

        content = self.fav.get('meta', {}).get('description', '')
        content = shared.Pandoc(False).convert(content)
        self.fm.content = content


class Favs(object):
    """Base class of the per-service favourite pullers."""

    def __init__(self, confgroup):
        """
        :param confgroup: name of the service; it is the prefix of the
            markdown files that belong to this service
        """
        self.confgroup = confgroup

    @property
    def lastpulled(self):
        """Newest mtime among this service's fav files, plus one second.

        :return: integer timestamp; 1 when no file of the service exists
        """
        pattern = os.path.join(
            shared.config.get('source', 'contentdir'),
            shared.config.get('source', 'favs'),
            "%s-*.md" % self.confgroup
        )
        # the leading 0 keeps max() defined when nothing matches
        mtime = max(
            [0] + [int(os.path.getmtime(f)) for f in glob.glob(pattern)]
        )

        mtime = mtime + 1
        # name the actual service; this used to say "flickr" for all of them
        logging.debug("last %s fav timestamp: %s", self.confgroup, mtime)
        return mtime


class FlickrFavs(Favs):
    """Pulls the favourites list of the configured Flickr user."""

    url = 'https://api.flickr.com/services/rest/'

    def __init__(self):
        """Resolve the user id and prepare the favourites query parameters."""
        super().__init__('flickr')
        self.get_uid()
        self.params = {
            'method': 'flickr.favorites.getList',
            'api_key': shared.config.get('flickr', 'api_key'),
            'user_id': self.uid,
            'extras': 'description,geo,tags,url_z,url_b,owner_name,date_upload',
            'per_page': 500,  # maximum
            'format': 'json',
            'nojsoncallback': '1',
            # only ask for favourites newer than the newest local file
            'min_fave_date': self.lastpulled
        }

    def get_uid(self):
        """Look up the user id of the configured username into ``self.uid``."""
        query = {
            'method': 'flickr.people.findByUsername',
            'api_key': shared.config.get('flickr', 'api_key'),
            'format': 'json',
            'nojsoncallback': '1',
            'username': shared.config.get('flickr', 'username'),
        }
        response = requests.get(self.url, params=query)
        self.uid = json.loads(response.text).get('user', {}).get('id')

    def getpaged(self, offset):
        """Return the photo list of result page number ``offset``.

        The page number is stored in ``self.params`` before the request.
        """
        logging.info('requesting page #%d of paginated results', offset)
        self.params['page'] = offset
        response = requests.get(self.url, params=self.params)
        return json.loads(response.text).get('photos', {}).get('photo', [])

    def run(self):
        """Collect every result page, then save each not yet existing fav."""
        response = requests.get(self.url, params=self.params)
        first = json.loads(response.text).get('photos', {})

        photos = first.get('photo', [])
        last_page = int(first.get('pages', 1))
        first_page = int(first.get('page', 1))

        # the first page is already loaded; fetch the remaining ones
        for page in range(first_page + 1, last_page + 1):
            photos = photos + self.getpaged(page)

        for photo in photos:
            fav = FlickrFav(photo)
            if fav.exists:
                continue
            fav.run()
            fav.write()

class FivehpxFavs(Favs):
    """Pulls the items of a 500px gallery of the authenticated user."""

    def __init__(self):
        """Prepare the query parameters and the OAuth client."""
        super(FivehpxFavs, self).__init__('500px')
        self.params = {
            'consumer_key': shared.config.get('500px', 'api_key'),
            'rpp': 100,  # maximum
            'image_size': 4,
            'include_tags': 1,
            'include_geo': 1,
            'sort': 'created_at',
            'sort_direction': 'desc'
        }
        self.oauth = oauth.FivehpxOauth()
        # both ids are filled in by run()
        self.uid = None
        self.galid = None

    def get_uid(self):
        """Store the id of the authenticated user in ``self.uid``."""
        r = self.oauth.request(
            'https://api.500px.com/v1/users',
            params={}
        )
        js = json.loads(r.text)
        self.uid = js.get('user', {}).get('id')

    def get_favgalid(self):
        """Store the id of the last gallery of kind 5 in ``self.galid``.

        ``self.galid`` is left untouched when the response lists no gallery.
        """
        r = self.oauth.request(
            'https://api.500px.com/v1/users/%s/galleries' % (self.uid),
            params={
                'kinds': 5  # see https://github.com/500px/api-documentation/blob/master/basics/formats_and_terms.md#gallery-kinds
            }
        )
        js = json.loads(r.text)
        # an empty or missing list used to raise IndexError on .pop()
        galleries = js.get('galleries') or []
        if not galleries:
            return
        self.galid = galleries[-1].get('id')

    @property
    def url(self):
        """Items endpoint of the gallery; needs ``uid`` and ``galid`` set."""
        return 'https://api.500px.com/v1/users/%s/galleries/%s/items' % (
            self.uid,
            self.galid
        )

    def getpaged(self, offset):
        """Return the photo list of result page number ``offset``.

        The page number is stored in ``self.params`` before the request. An
        empty list is returned when the response has no photos.
        """
        logging.info('requesting page #%d of paginated results', offset)
        self.params.update({
            'page': offset
        })
        r = requests.get(
            self.url,
            params=self.params
        )
        parsed = json.loads(r.text)
        return parsed.get('photos') or []

    def run(self):
        """Collect every result page, then save each not yet existing fav."""
        self.get_uid()
        self.get_favgalid()
        if self.galid is None:
            # without a gallery id the items URL would be meaningless
            logging.error('no 500px gallery id was found, skipping 500px favs')
            return

        r = requests.get(self.url, params=self.params)
        js = json.loads(r.text)
        # a response without photos must not break the list handling below
        photos = js.get('photos') or []

        total = int(js.get('total_pages', 1))
        current = int(js.get('current_page', 1))

        # the first page is already loaded; fetch the remaining ones
        for page in range(current + 1, total + 1):
            photos = photos + self.getpaged(page)

        for photo in photos:
            fav = FivehpxFav(photo)
            if not fav.exists:
                fav.run()
                fav.write()


class TumblrFavs(Favs):
    """Pulls the liked posts of the authenticated Tumblr user."""

    url = 'https://api.tumblr.com/v2/user/likes'

    def __init__(self):
        """Set up the OAuth client and the parameters of the first request."""
        super().__init__('tumblr')
        self.likes = []
        self.oauth = oauth.TumblrOauth()
        # the first request only asks for likes after the newest local file
        self.params = {'after': self.lastpulled}

    def getpaged(self, offset):
        """Return the parsed response of the likes request at ``offset``."""
        response = self.oauth.request(self.url, params={'offset': offset})
        return json.loads(response.text)

    def run(self):
        """Collect the liked posts, then save each not yet existing fav."""
        step = 20
        first = json.loads(
            self.oauth.request(self.url, params=self.params).text
        )
        response = first.get('response', {})
        total = int(response.get('liked_count', step))
        likes = response.get('liked_posts', [])

        # keep requesting in steps of 20 until liked_count is reached
        for offset in range(step, total, step):
            page = self.getpaged(offset)
            likes = likes + page.get('response', {}).get('liked_posts', [])

        self.likes = likes
        for like in self.likes:
            fav = TumblrFav(like)
            if fav.exists:
                continue
            fav.run()
            fav.write()


class DAFavs(Favs):
    """Pulls the deviations of the 'Featured' deviantArt collection folder."""

    def __init__(self):
        """Prepare the OAuth client and the collection query parameters."""
        super(DAFavs, self).__init__('deviantart')
        # a stray trailing comma used to turn this into a 1-tuple
        self.username = shared.config.get(self.confgroup, 'username')
        self.oauth = oauth.DAOauth()
        self.likes = []
        self.galid = None
        self.params = {
            'limit': 24,  # this is the max as far as I can tell
            'mature_content': 'true',
            'username': self.username
        }

    def get_favgalid(self):
        """Store the id of the folder named 'Featured' in ``self.galid``.

        ``self.galid`` stays None when no folder of that name is listed.
        """
        r = self.oauth.request(
            'https://www.deviantart.com/api/v1/oauth2/collections/folders',
            params={
                'username': self.username,
                'calculate_size': 'false',
                'ext_preload': 'false',
                'mature_content': 'true'
            }
        )
        js = json.loads(r.text)
        for g in js.get('results', []):
            if 'Featured' == g.get('name'):
                self.galid = g.get('folderid')
                break

    @property
    def url(self):
        """Endpoint of the collection folder; needs ``galid`` to be set."""
        return 'https://www.deviantart.com/api/v1/oauth2/collections/%s' % (self.galid)

    def getpaged(self, offset):
        """Return the parsed response of the collection request at ``offset``.

        The offset is stored in ``self.params`` before the request.
        """
        self.params.update({'offset': offset})
        r = self.oauth.request(
            self.url,
            self.params
        )
        js = json.loads(r.text)
        return js

    def getsinglemeta(self, daid):
        """Return the metadata dict of a single deviation.

        :param daid: id of the deviation
        :return: the last entry of the response's ``metadata`` list; when
            that cannot be extracted, the parsed response itself if it is a
            dict, otherwise an empty dict
        """
        r = self.oauth.request(
            'https://www.deviantart.com/api/v1/oauth2/deviation/metadata',
            params={
                'deviationids[]': daid,
                'ext_submission': False,
                'ext_camera': False,
                'ext_stats': False,
                'ext_collection': False,
                'mature_content': True,
            }
        )
        meta = {}
        try:
            meta = json.loads(r.text)
            return meta.get('metadata', []).pop()
        except (ValueError, TypeError, AttributeError, IndexError):
            # invalid JSON, unexpected structure or an empty metadata list;
            # callers use .get() on the result, so always hand back a dict
            return meta if isinstance(meta, dict) else {}

    def has_more(self, q):
        """Normalize the ``has_more`` flag, given as a bool or as a string."""
        return q in (True, 'True', 'true')

    def run(self):
        """Collect the whole folder, then save each not yet existing fav."""
        self.get_favgalid()
        if self.galid is None:
            # without a folder id the collection URL would be meaningless
            logging.error('no deviantArt Featured folder was found, skipping deviantArt favs')
            return

        r = self.oauth.request(
            self.url,
            self.params
        )

        js = json.loads(r.text)
        favs = js.get('results', [])
        has_more = self.has_more(js.get('has_more'))
        offset = js.get('next_offset')
        # stop as well when the API gives no (or a zero) next offset
        while has_more and offset:
            offset = int(offset)
            logging.info('iterating over DA results with offset %d', offset)
            paged = self.getpaged(offset)
            new = paged.get('results', [])
            if not new:
                break
            favs = favs + new
            has_more = self.has_more(paged.get('has_more'))
            # next_offset is the absolute offset of the next page; adding
            # it to the current offset skipped pages
            offset = paged.get('next_offset')

        self.favs = favs
        for fav in self.favs:
            f = DAFav(fav)
            if f.exists:
                continue

            # description and tags come from a separate metadata request
            f.fav.update({'meta': self.getsinglemeta(fav.get('deviationid'))})
            f.run()
            f.write()

if __name__ == '__main__':
    # command line: only the log level can be changed
    parser = argparse.ArgumentParser(description='Parameters for NASG')
    parser.add_argument('--loglevel', default='error', help='change loglevel')
    args = parser.parse_args()

    # basicConfig() does nothing while the root logger has handlers, so
    # remove any that are already attached
    for handler in list(logging.root.handlers):
        logging.root.removeHandler(handler)

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=shared.LLEVEL[args.loglevel]
    )

    # HNBookmarks is intentionally left out of the run for now
    for service in (FlickrFavs, FivehpxFavs, TumblrFavs, DAFavs):
        service().run()