adding pesos code and cleanups

This commit is contained in:
Peter Molnar 2017-06-12 14:17:29 +00:00
parent 8c097971a0
commit 6078096ee3
6 changed files with 558 additions and 149 deletions

374
nasg.py
View file

@ -25,7 +25,8 @@ import frontmatter
from slugify import slugify
import langdetect
import requests
from breadability.readable import Article
#from breadability.readable import Article
from newspaper import Article as newspaper3k
from whoosh import index
from whoosh import qparser
import jinja2
@ -34,6 +35,7 @@ import shared
from webmentiontools.send import WebmentionSend
from bleach import clean
from emoji import UNICODE_EMOJI
from bs4 import BeautifulSoup
def splitpath(path):
parts = []
@ -114,8 +116,8 @@ class Indexer(object):
]
content_remote = []
for url, offlinecopy in singular.offlinecopies.items():
content_remote.append("%s" % offlinecopy)
#for url, offlinecopy in singular.offlinecopies.items():
#content_remote.append("%s" % offlinecopy)
weight = 1
if singular.isbookmark:
@ -154,15 +156,13 @@ class Indexer(object):
def finish(self):
self.writer.commit()
class OfflineCopy(object):
def __init__(self, url):
self.url = url
self.fname = hashlib.sha1(url.encode('utf-8')).hexdigest()
self.targetdir = os.path.abspath(
shared.config.get('source', 'offlinecopiesdir')
)
self.fname = "%s.md" % slugify(re.sub(r"^https?://", "", url))[:200]
self.target = os.path.join(
self.targetdir,
shared.config.get('source', 'offlinecopiesdir'),
self.fname
)
self.fm = frontmatter.loads('')
@ -170,6 +170,10 @@ class OfflineCopy(object):
'url': self.url,
'date': arrow.utcnow().format("YYYY-MM-DDTHH:mm:ssZ"),
}
self.headers = requests.utils.default_headers()
self.headers.update({
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
def __repr__(self):
return self.fm.content
@ -183,6 +187,42 @@ class OfflineCopy(object):
with open(self.target, 'wt') as f:
f.write(frontmatter.dumps(self.fm))
@property
def archiveorgurl(self):
    """Return the closest archive.org snapshot URL for self.url, or None.

    Queries the Wayback Machine "available" API through self.fetch();
    any fetch or parse failure is logged and reported as None.
    """
    response = self.fetch(
        "http://archive.org/wayback/available?url=%s" % self.url,
    )
    if not response:
        return None
    try:
        payload = json.loads(response.text)
        closest = payload.get('archived_snapshots', {}).get('closest', {})
        return closest.get('url', None)
    except Exception as e:
        logging.error("archive.org parsing failed: %s", e)
        return None
def fetch(self, url):
    """GET `url` and return the requests.Response on HTTP 200, else None.

    Bug fix: the original requested self.url regardless of the `url`
    argument, so the archive.org fallback in run() never actually fetched
    the archived copy. Also short-circuits on a falsy url (archiveorgurl
    may return None) and logs the previously swallowed exception.
    """
    if not url:
        return None
    try:
        r = requests.get(
            url,
            allow_redirects=True,
            timeout=60,
            headers=self.headers
        )
    except Exception as e:
        logging.error("fetching %s failed: %s", url, e)
        return None
    if r.status_code == requests.codes.ok:
        return r
    # non-200 responses are treated as a miss, same as before
    return None
def run(self):
if os.path.isfile(self.target):
with open(self.target) as f:
@ -190,39 +230,17 @@ class OfflineCopy(object):
return
logging.info("prepairing offline copy of %s", self.url)
headers = requests.utils.default_headers()
headers.update({
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
})
r = self.fetch(self.url)
if not r:
r = self.fetch(self.archiveorgurl)
try:
r = requests.get(
self.url,
allow_redirects=True,
timeout=60,
headers=headers
)
except Exception as e:
logging.error("%s failed:\n%s", self.url, e)
self.write()
return
if r:
if r.url != self.url:
self.fm.metadata['realurl'] = r.url
self.fm.content = r.text
if r.status_code != requests.codes.ok:
logging.warning("%s returned %s", self.url, r.status_code)
self.write()
return
if not len(r.text):
logging.warning("%s was empty", self.url)
self.write()
return
doc = Article(r.text, url=self.url)
self.fm.metadata['title'] = doc._original_document.title
self.fm.metadata['realurl'] = r.url
self.fm.content = shared.Pandoc(False).convert(doc.readable)
self.write()
return
class Renderer(object):
@ -551,7 +569,7 @@ class WebImage(object):
self.alttext = ''
self.sizes = []
self.fallbacksize = int(shared.config.get('common','fallbackimg', fallback='720'))
self.cl = None
self.cl = ''
self.singleimage = False
for size in shared.config.options('downsize'):
@ -587,35 +605,118 @@ class WebImage(object):
)
def __str__(self):
if self.is_downsizeable and not self.cl:
uphoto = ''
if self.singleimage:
uphoto = ' u-photo'
return '\n<figure class="photo"><a target="_blank" class="adaptive%s" href="%s"><img src="%s" class="adaptimg" alt="%s" /></a><figcaption class=\"caption\">%s%s</figcaption></figure>\n' % (
uphoto,
if self.is_downsizeable:
if self.singleimage and not self.cl:
self.cl = '.u-photo'
elif self.singleimage:
self.cl = '.u-photo %s' % self.cl
return '[![%s](%s "%s%s"){.adaptimg}](%s){.adaptive %s}' % (
self.alttext,
self.fallback,
self.fname,
self.ext,
self.target,
self.fallback,
self.alttext,
self.fname,
self.ext
self.cl
)
elif self.cl:
self.cl = self.cl.replace('.', ' ')
return '<img src="%s" class="%s" alt="%s" title="%s%s" />' % (
self.fallback,
self.cl,
else:
if not self.cl:
self.cl = '.aligncenter'
return '![%s](%s "%s%s"){%s}' % (
self.alttext,
self.fallback,
self.fname,
self.ext
self.ext,
self.cl
)
else:
return '<img src="%s" class="aligncenter" alt="%s" title="%s%s" />' % (
self.fallback,
self.alttext,
self.fname,
self.ext
)
@property
def exif(self):
    """Normalized EXIF data pulled from self.meta, cached on first access.

    Returns {} for non-photos. For each normalized key the first present,
    truthy candidate field wins; GPS values are rounded to 5 decimals.
    """
    if not self.is_photo:
        return {}
    if hasattr(self, '_exif'):
        return self._exif
    # normalized key -> raw EXIF field candidates, in priority order
    mapping = {
        'camera': ['Model'],
        'aperture': ['FNumber', 'Aperture'],
        'shutter_speed': ['ExposureTime'],
        'focallength35mm': ['FocalLengthIn35mmFormat'],
        'focallength': ['FocalLength'],
        'iso': ['ISO'],
        'lens': ['LensID'],
        'date': ['CreateDate', 'DateTimeOriginal'],
        'geo_latitude': ['GPSLatitude'],
        'geo_longitude': ['GPSLongitude'],
    }
    collected = {}
    for ekey, candidates in mapping.items():
        for candidate in candidates:
            value = self.meta.get(candidate, None)
            if not value:
                continue
            if 'geo_' in ekey:
                collected[ekey] = round(float(value), 5)
            else:
                collected[ekey] = value
            break
    self._exif = collected
    return self._exif
#def __str__(self):
#if self.is_downsizeable and not self.cl:
#uphoto = ''
#if self.singleimage:
#uphoto = ' u-photo'
#return '\n<figure class="photo"><a target="_blank" class="adaptive%s" href="%s"><img src="%s" class="adaptimg" alt="%s" /></a><figcaption class=\"caption\">%s%s</figcaption></figure>\n' % (
#uphoto,
#self.target,
#self.fallback,
#self.alttext,
#self.fname,
#self.ext
#)
#elif self.cl:
#self.cl = self.cl.replace('.', ' ')
#return '<img src="%s" class="%s" alt="%s" title="%s%s" />' % (
#self.fallback,
#self.cl,
#self.alttext,
#self.fname,
#self.ext
#)
#else:
#return '<img src="%s" class="aligncenter" alt="%s" title="%s%s" />' % (
#self.fallback,
#self.alttext,
#self.fname,
#self.ext
#)
@property
def rssenclosure(self):
@ -869,6 +970,9 @@ class Taxonomy(BaseIter):
return "%s/%d/index.html" % (self.pagep, page)
async def render(self, renderer):
if not self.slug or self.slug is 'None':
return
self.__mkdirs()
page = 1
testpath = self.tpath(page)
@ -907,7 +1011,8 @@ class Taxonomy(BaseIter):
'taxonomy': self.taxonomy,
'paged': page,
'total': self.pages,
'perpage': pagination
'perpage': pagination,
'lastmod': arrow.get(self.mtime).datetime
},
'site': renderer.sitevars,
'posts': posttmpls,
@ -1100,12 +1205,41 @@ class Singular(BaseRenderable):
def __parse(self):
with open(self.path, mode='rt') as f:
self.meta, self.content = frontmatter.parse(f.read())
self.__filter_images()
self.__filter_favs()
self.__filter_images()
if self.isphoto:
self.content = "%s\n%s" % (
self.content,
self.photo
)
# REMOVE THIS
trigger = self.offlinecopies
def __filter_favs(self):
    """Replace/prepend content with a linked preview image for fav posts.

    Requires both a target URL (favorite-of / like-of / bookmark-of, in
    that priority) and an `image` entry in the metadata; otherwise the
    content is left untouched. For bookmarks the original content is
    appended after the preview.
    """
    # sentinel instead of a falsy default: a key that is *present* but
    # falsy must still stop the fallback chain, exactly like nested get()s
    missing = object()
    url = False
    for key in ('favorite-of', 'like-of', 'bookmark-of'):
        value = self.meta.get(key, missing)
        if value is not missing:
            url = value
            break
    img = self.meta.get('image', False)
    if not img or not url:
        return
    preview = '[![%s](/%s/%s)](%s){.favurl}' % (
        self.title,
        shared.config.get('source', 'files'),
        img,
        url
    )
    if self.isbookmark:
        preview = "%s\n\n%s" % (preview, self.content)
    self.content = preview
def __filter_images(self):
linkto = False
@ -1191,6 +1325,8 @@ class Singular(BaseRenderable):
'bookmark-of': 'bookmark',
'repost-of': 'repost',
'in-reply-to': 'reply',
'favorite-of': 'fav',
'like-of': 'like',
}
reactions = {}
@ -1281,6 +1417,25 @@ class Singular(BaseRenderable):
def isbookmark(self):
return self.meta.get('bookmark-of', False)
@property
def isreply(self):
    """The in-reply-to target URL when this post is a reply, else False."""
    return self.meta.get('in-reply-to', False)
# TODO
#@property
#def isrvsp(self):
# r'<data class="p-rsvp" value="([^"])">([^<]+)</data>'
@property
def isfav(self):
    """The liked/favorited target URL when this post is a fav, else False."""
    for key in ('like-of', 'favorite-of'):
        value = self.meta.get(key, False)
        if value:
            return value
    return False
@property
def ispage(self):
if not self.meta:
@ -1289,7 +1444,11 @@ class Singular(BaseRenderable):
@property
def isonfront(self):
if self.ispage or self.isbookmark:
if self.ispage:
return False
if self.isbookmark:
return False
if self.isfav:
return False
return True
@ -1366,59 +1525,9 @@ class Singular(BaseRenderable):
@property
def exif(self):
if not self.isphoto:
return None
return {}
if hasattr(self, '_exif'):
return self._exif
exif = {}
mapping = {
'camera': [
'Model'
],
'aperture': [
'FNumber',
'Aperture'
],
'shutter_speed': [
'ExposureTime'
],
'focallength35mm': [
'FocalLengthIn35mmFormat',
],
'focallength': [
'FocalLength',
],
'iso': [
'ISO'
],
'lens': [
'LensID',
],
'date': [
'CreateDate',
'DateTimeOriginal',
],
'geo_latitude': [
'GPSLatitude'
],
'geo_longitude': [
'GPSLongitude'
],
}
for ekey, candidates in mapping.items():
for candidate in candidates:
maybe = self.photo.meta.get(candidate, None)
if maybe:
if 'geo_' in ekey:
exif[ekey] = round(float(maybe), 5)
else:
exif[ekey] = maybe
break
self._exif = exif
return self._exif
return self.photo.exif
@property
def rssenclosure(self):
@ -1441,7 +1550,8 @@ class Singular(BaseRenderable):
'category': self.category,
'reactions': self.reactions,
'updated': self.updated.datetime,
'summary': self.sumhtml,
'summary': self.summary,
'sumhtml': self.sumhtml,
'exif': self.exif,
'lang': self.lang,
'syndicate': '',
@ -1459,21 +1569,9 @@ class Singular(BaseRenderable):
def shortslug(self):
if hasattr(self, '_shortslug'):
return self._shortslug
self._shortslug = self.baseN(self.pubtime)
self._shortslug = shared.baseN(self.pubtime)
return self._shortslug
@staticmethod
def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
""" Used to create short, lowecase slug for a number (an epoch) passed """
num = int(num)
return ((num == 0) and numerals[0]) or (
Singular.baseN(
num // b,
b,
numerals
).lstrip(numerals[0]) + numerals[num % b]
)
async def rendercomments(self, renderer):
for comment in self.comments:
await comment.render(renderer)
@ -1507,9 +1605,6 @@ class Singular(BaseRenderable):
logging.debug('%s exists and up-to-date (lastmod: %d)', target, ttime)
return
#if not os.path.isdir(targetdir):
#os.mkdir(targetdir)
tmplvars = {
'post': self.tmplvars,
'site': renderer.sitevars,
@ -1517,11 +1612,6 @@ class Singular(BaseRenderable):
}
r = renderer.j2.get_template(self.tmplfile).render(tmplvars)
self.writerendered(target, r, mtime)
#with open(target, "w") as html:
#logging.debug('writing %s', target)
#html.write(r)
#html.close()
#os.utime(target, (mtime, mtime))
async def ping(self, pinger):
@ -1542,7 +1632,11 @@ class Singular(BaseRenderable):
logging.info("sending webmention from %s to %s", self.url, target)
ws = WebmentionSend(self.url, target)
ws.send(allow_redirects=True, timeout=30)
try:
ws.send(allow_redirects=True, timeout=30)
except Exception as e:
logging.error('ping failed to %s', target)
pinger.db[h] = record
class Webmentioner(object):

4
new.py
View file

@ -9,8 +9,6 @@ import glob
import sys
import tempfile
from slugify import slugify
import nasg
import shared
if __name__ == '__main__':
@ -78,7 +76,7 @@ if __name__ == '__main__':
elif args['repost']:
slug = slugify("re: %s" % (args['repost']), only_ascii=True, lower=True)
else:
slug = nasg.Singular.baseN(now.timestamp)
slug = shared.baseN(now.timestamp)
args['slug'] = input('Slug [%s]: ' % (slug)) or slug
if args['slug'] in slugs:

302
pesos.py Normal file
View file

@ -0,0 +1,302 @@
import json
import os
import hashlib
import glob
import frontmatter
import requests
import shared
import logging
import re
import shutil
import arrow
import bs4
from slugify import slugify
from pprint import pprint
class Bookmark(object):
    """A single bookmark persisted as a frontmatter markdown file."""

    def __init__(self, title, url, fname=None):
        # the base filename defaults to a slug of the title
        basename = fname or slugify(title)
        self.fname = "%s.md" % basename
        self.target = os.path.join(
            shared.config.get('source', 'contentdir'),
            shared.config.get('source', 'bookmarks'),
            self.fname
        )
        self.fm = frontmatter.loads('')
        self.fm.metadata = {
            'published': arrow.utcnow().format(shared.ARROWISO),
            'title': title,
            'bookmark-of': url,
        }

    def write(self):
        """Serialize the frontmatter document to self.target."""
        logging.info('saving bookmark to %s', self.target)
        with open(self.target, 'wt') as t:
            t.write(frontmatter.dumps(self.fm))
class HNBookmarks(object):
    """Mirrors the configured user's Hacker News favourites as Bookmarks."""

    # filename prefix marking entries that came from Hacker News
    prefix = 'hn-'

    def __init__(self):
        self.url = 'https://news.ycombinator.com/favorites?id=%s' % (
            shared.config.get('hackernews', 'user_id')
        )

    @property
    def existing(self):
        """Return the HN item ids already saved locally (cached).

        Bug fix: the prefix/extension were previously stripped from the
        *full path* before basename(), which could mangle the result if a
        directory component happened to contain the prefix or '.md'.
        """
        if hasattr(self, '_existing'):
            return self._existing
        d = os.path.join(
            shared.config.get('source', 'contentdir'),
            "*",
            "%s*.md" % self.prefix
        )
        ids = []
        for f in reversed(sorted(glob.glob(d))):
            # basename first, THEN strip the prefix and extension
            name = os.path.basename(f)
            ids.append(name.replace(self.prefix, '').replace('.md', ''))
        self._existing = ids
        return self._existing

    def run(self):
        """Fetch the favourites page and write a Bookmark for each new story."""
        r = requests.get(self.url)
        soup = bs4.BeautifulSoup(r.text, "html5lib")
        rows = soup.find_all('tr', attrs={'class': 'athing'})
        for row in rows:
            rid = row.get('id')
            if rid in self.existing:
                continue
            link = row.find('a', attrs={'class': 'storylink'})
            url = link.get('href')
            title = " ".join(link.contents)
            fname = "%s%s" % (self.prefix, rid)
            bookmark = Bookmark(title, url, fname)
            bookmark.write()
class Fav(object):
    """Base class for a favourited remote entry saved as markdown + image."""

    def __init__(self):
        # fav timestamp; subclasses overwrite it from the API response
        self.arrow = arrow.utcnow()
        self.fm = frontmatter.loads('')

    @property
    def target(self):
        """Markdown file path for this fav; self.fname is set by subclasses."""
        return os.path.join(
            shared.config.get('source', 'contentdir'),
            shared.config.get('source', 'favs'),
            self.fname
        )

    @property
    def exists(self):
        # NOTE(review): deliberately always False for now — presumably to
        # force a full re-pull; the real check is kept for re-enabling.
        return False
        #return os.path.isfile(self.target)

    @property
    def imgname(self):
        # the _ is to differentiate between my photos, where the md and jpg
        # name is the same, and favs
        return self.fname.replace('.md', '_.jpg')

    @property
    def imgtarget(self):
        return os.path.join(
            shared.config.get('source', 'filesdir'),
            self.imgname
        )

    def saveimg(self, url):
        """Stream `url` into imgtarget unless a file already exists there.

        Fixes: the streamed connection is now always closed, and non-200
        responses are logged instead of being silently dropped.
        """
        target = self.imgtarget
        if os.path.isfile(target):
            logging.error("%s already exists, refusing to overwrite", target)
            return
        logging.info("pulling image %s to files", url)
        r = requests.get(url, stream=True)
        try:
            if r.status_code == 200:
                with open(target, 'wb') as f:
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
            else:
                logging.warning(
                    "image pull of %s returned %s", url, r.status_code
                )
        finally:
            # release the streamed connection to avoid leaking the socket
            r.close()

    def write(self):
        """Write the frontmatter doc and backdate mtime to the fav time."""
        logging.info('saving fav to %s', self.target)
        with open(self.target, 'wt') as t:
            t.write(frontmatter.dumps(self.fm))
        os.utime(self.target, (self.arrow.timestamp, self.arrow.timestamp))
class FlickrFav(Fav):
    """A favourited Flickr photo."""

    def __init__(self, photo):
        super(FlickrFav, self).__init__()
        self.photo = photo
        self.ownerid = photo.get('owner')
        self.photoid = photo.get('id')
        self.fname = "flickr-%s-%s.md" % (self.ownerid, self.photoid)
        self.url = "https://www.flickr.com/photos/%s/%s" % (self.ownerid, self.photoid)

    def run(self):
        """Pull the image, then fill metadata and content from the API data."""
        # prefer the large size, fall back to the medium one
        img = self.photo.get('url_b', self.photo.get('url_z', False))
        if not img:
            logging.error("image url was empty for %s, skipping fav", self.url)
            return
        self.saveimg(img)
        faved = self.photo.get('date_faved', arrow.utcnow().timestamp)
        self.arrow = arrow.get(faved)
        meta = {
            'published': self.arrow.format(shared.ARROWISO),
            'title': '%s' % self.photo.get('title', self.fname),
            'favorite-of': self.url,
            'flickr_tags': self.photo.get('tags', '').split(' '),
            'geo': {
                'latitude': self.photo.get('latitude', ''),
                'longitude': self.photo.get('longitude', ''),
            },
            'author': {
                'name': self.photo.get('owner_name'),
                'url': 'https://www.flickr.com/people/%s' % (
                    self.photo.get('owner')
                ),
            },
            'image': self.imgname
        }
        self.fm.metadata = meta
        raw = self.photo.get('description', {}).get('_content', '')
        self.fm.content = shared.Pandoc(False).convert(raw)
class FivehpxFav(Fav):
    """A favourited 500px photo."""

    def __init__(self, photo):
        super(FivehpxFav, self).__init__()
        self.photo = photo
        self.ownerid = photo.get('user_id')
        self.photoid = photo.get('id')
        self.fname = "500px-%s-%s.md" % (self.ownerid, self.photoid)
        self.url = "https://www.500px.com%s" % (photo.get('url'))

    def run(self):
        """Pull the image, then fill metadata and content from the API data."""
        # Bug fix: guard against a missing or empty `images` array instead
        # of crashing with TypeError/IndexError; treat it like an empty url
        images = self.photo.get('images') or []
        img = images[0].get('url') if images else False
        if not img:
            logging.error("image url was empty for %s, skipping fav", self.url)
            return
        self.saveimg(img)
        self.arrow = arrow.get(
            self.photo.get('created_at', arrow.utcnow().timestamp)
        )
        self.fm.metadata = {
            'published': self.arrow.format(shared.ARROWISO),
            'title': '%s' % self.photo.get('name', self.fname),
            'favorite-of': self.url,
            'fivehpx_tags': self.photo.get('tags', []),
            'geo': {
                'latitude': self.photo.get('latitude', ''),
                'longitude': self.photo.get('longitude', ''),
            },
            'author': {
                'name': self.photo.get('user').get('fullname', self.ownerid),
                'url': 'https://www.500px.com/%s' % (
                    self.photo.get('user').get('username', self.ownerid)
                ),
            },
            'image': self.imgname
        }
        content = self.photo.get('description', '')
        if content:
            content = shared.Pandoc(False).convert(content)
        else:
            content = ''
        self.fm.content = content
class Favs(object):
    """Shared plumbing for the per-service favourite pullers."""

    def __init__(self, confgroup):
        self.confgroup = confgroup
        self.url = shared.config.get(confgroup, 'fav_api')

    @property
    def lastpulled(self):
        """Timestamp to pull favs from; currently hardwired to 0.

        NOTE(review): the early `return 0` makes everything below
        unreachable — presumably a temporary "pull everything" switch;
        the mtime scan is kept for when it gets re-enabled. TODO confirm.
        """
        return 0
        mtime = 0
        d = os.path.join(
            shared.config.get('source', 'contentdir'),
            shared.config.get('source', 'favs'),
            "%s-*.md" % self.confgroup
        )
        for f in glob.glob(d):
            ftime = int(os.path.getmtime(f))
            mtime = max(mtime, ftime)
        mtime = mtime + 1
        logging.debug("last flickr fav timestamp: %s", mtime)
        return mtime
class FlickrFavs(Favs):
    """Pulls the favourites list from the Flickr API."""

    def __init__(self):
        super(FlickrFavs, self).__init__('flickr')
        self.params = {
            'method': 'flickr.favorites.getList',
            'api_key': shared.config.get('flickr', 'api_key'),
            'user_id': shared.config.get('flickr', 'user_id'),
            'extras': 'description,geo,tags,url_z,url_b,owner_name,date_upload',
            'per_page': 500,  # Flickr API maximum
            'format': 'json',
            'nojsoncallback': '1',
            'min_fave_date': self.lastpulled
        }

    def run(self):
        """Fetch the fav list and persist each not-yet-saved photo.

        Consistency fix: skip already-existing favs via fav.exists, the
        same way FivehpxFavs.run does (no behavior change while
        Fav.exists is stubbed to False).
        """
        r = requests.get(self.url, params=self.params)
        js = json.loads(r.text)
        for photo in js.get('photos', {}).get('photo', []):
            fav = FlickrFav(photo)
            if fav.exists:
                continue
            fav.run()
            fav.write()
class FivehpxFavs(Favs):
    """Pulls the favourites list from the 500px API."""

    def __init__(self):
        super(FivehpxFavs, self).__init__('500px')
        self.params = {
            'consumer_key': shared.config.get('500px', 'api_key'),
            'rpp': 100,  # results per page
            'image_size': 4,
            'include_tags': 1,
            'include_geo': 1
        }

    def run(self):
        """Fetch the fav list and persist each not-yet-saved photo."""
        r = requests.get(self.url, params=self.params)
        js = json.loads(r.text)
        for photo in js.get('photos', []):
            fav = FivehpxFav(photo)
            if fav.exists:
                continue
            fav.run()
            fav.write()
if __name__ == '__main__':
    # drop any handlers a library may have installed before configuring ours
    while logging.root.handlers:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging.basicConfig(
        level=20,  # logging.INFO
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    # pull order matters only for log readability; kept as before
    FlickrFavs().run()
    HNBookmarks().run()
    FivehpxFavs().run()

10
search.py Normal file → Executable file
View file

@ -1,9 +1,11 @@
#!/usr/bin/env python3
import os
#import sys
#sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import asyncio
import uvloop
import os
from sanic import Sanic
import sanic.response
from sanic.log import log as logging
@ -66,8 +68,8 @@ if __name__ == '__main__':
jenv = jinja2.Environment(loader=jldr)
tmpl = jenv.get_template('searchresults.html')
@app.route("/search")
async def search(request, methods=["GET"]):
@app.route("/search", methods=["GET"])
async def search(request):
query = request.args.get('s')
r = SearchHandler(query, tmpl)
return r

View file

@ -25,6 +25,18 @@ def __expandconfig(config):
))
return config
def baseN(num, b=36, numerals="0123456789abcdefghijklmnopqrstuvwxyz"):
    """Return the non-negative integer `num` rendered in base `b`.

    Used to create a short, lowercase slug from a number (an epoch).
    Rewritten iteratively with divmod — the recursive original relied on
    a fragile lstrip() trick and recursed once per digit.

    :param num: value to convert; anything int() accepts, assumed >= 0
    :param b: target base; must be <= len(numerals)
    :param numerals: digit alphabet, index i is the digit for value i
    :returns: string representation; numerals[0] ('0') for zero
    """
    num = int(num)
    if num == 0:
        return numerals[0]
    digits = []
    while num:
        num, rem = divmod(num, b)
        digits.append(numerals[rem])
    return ''.join(reversed(digits))
ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
STRFISO = '%Y-%m-%dT%H:%M:%S%z'

5
webmention.py Normal file → Executable file
View file

@ -1,3 +1,5 @@
#!/usr/bin/env python3
import asyncio
import uvloop
import os
@ -111,8 +113,7 @@ class WebmentionHandler(object):
def _save(self):
target = os.path.join(
shared.config.get('source', 'commentsdir'),
self.mhash,
'.md'
"%s.md" % self.mhash
)
if os.path.isfile(target):