- now checking images against the Google Vision API (why Google, despite my dislike of Google, the company: the Vision API is the only one which is simple enough to use, and their labelling is reasonable). - checking text against the Google Natural Language API: the strict classification it offers is better than a free folksonomy if I ever want to connect entries based on topic; unfortunately, they don't support Hungarian yet.
Peter Molnar hello@petermolnar.eu
Tue, 11 Dec 2018 14:06:18 +0000
8 files changed,
320 insertions(+),
206 deletions(-)
M
.gitignore
→
.gitignore
@@ -5,4 +5,3 @@ .idea
lib gcloud.json tests/.Exif.tests.jpg.json -Pipfile.lock
D
exiftool.py
@@ -1,202 +0,0 @@
-__author__ = "Peter Molnar" -__copyright__ = "Copyright 2017-2018, Peter Molnar" -__license__ = "apache-2.0" -__maintainer__ = "Peter Molnar" -__email__ = "mail@petermolnar.net" - -import re -import subprocess -import json -import os -import keys -import requests - -from pprint import pprint - -EXIFDATE = re.compile( - r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+' - r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$' -) - -class CachedMeta(dict): - def __init__(self, fpath): - self.fpath = fpath - - @property - def cfile(self): - return os.path.join( - os.path.dirname(self.fpath), - ".%s.%s.json" % ( - self.__class__.__name__, - os.path.basename(self.fpath) - ) - ) - - @property - def _is_cached(self): - if os.path.exists(self.cfile): - mtime = os.path.getmtime(self.fpath) - ctime = os.path.getmtime(self.cfile) - if ctime >= mtime: - return True - return False - - def _read(self): - if not self._is_cached: - self._call_tool() - self._cache_update() - else: - self._cache_read() - - def _cache_update(self): - with open(self.cfile, 'wt') as f: - f.write(json.dumps(self, indent=4, sort_keys=True)) - - def _cache_read(self): - with open(self.cfile, 'rt') as f: - data = json.loads(f.read()) - for k, v in data.items(): - self[k] = v - -class GoogleVision(CachedMeta): - def __init__(self, fpath, imgurl): - self.fpath = fpath - self.imgurl = imgurl - self._read() - - @property - def cntr(self): - curr = 0 - if os.path.exists('/tmp/visionapicallcounter'): - with open('/tmp/visionapicallcounter', 'rt') as f: - curr = int(f.read()) - curr = curr + 1 - with open('/tmp/visionapicallcounter', 'wt') as f: - f.write("%d" % curr) - return curr - - def _call_tool(self): - if (self.cntr >= 500 ): - raise ValueError('already at 500 requests!') - - params = { - "requests": [ - { - "image": { - "source": { - "imageUri": self.imgurl, - } - }, - "features": [ - { - "type": "LANDMARK_DETECTION", - }, - { - "type": "LABEL_DETECTION", - }, - ] - } - ] - } - - url = 
"https://vision.googleapis.com/v1/images:annotate?key=%s" % (keys.gcloud.get('key')) - r = requests.post(url, json=params) - try: - resp = r.json() - resp = resp['responses'][0] - for k, v in resp.items(): - self[k] = v - except Exception as e: - logging.error('failed to call Google Vision API on: %s, reason: %s', self.fpath, e) - -class Exif(CachedMeta): - def __init__(self, fpath): - self.fpath = fpath - self._read() - - def _call_tool(self): - """ - Why like this: the # on some of the params forces exiftool to - display values like decimals, so the latitude / longitude params - can be used and parsed in a sane way - - If only -json is passed, it gets everything nicely, but in the default - format, which would require another round to parse - - """ - cmd = ( - "exiftool", - '-sort', - '-json', - '-MIMEType', - '-FileType', - '-FileName', - '-FileSize#', - '-ModifyDate', - '-CreateDate', - '-DateTimeOriginal', - '-ImageHeight', - '-ImageWidth', - '-Aperture', - '-FOV', - '-ISO', - '-FocalLength', - '-FNumber', - '-FocalLengthIn35mmFormat', - '-ExposureTime', - '-Model', - '-GPSLongitude#', - '-GPSLatitude#', - '-LensID', - '-LensSpec', - '-Lens', - '-ReleaseDate', - '-Description', - '-Headline', - '-HierarchicalSubject', - '-Copyright', - '-Artist', - self.fpath - ) - - p = subprocess.Popen( - cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - stdout, stderr = p.communicate() - if stderr: - raise OSError("Error reading EXIF:\n\t%s\n\t%s", cmd, stderr) - - exif = json.loads(stdout.decode('utf-8').strip()).pop() - if 'ReleaseDate' in exif and 'ReleaseTime' in exif: - exif['DateTimeRelease'] = "%s %s" % ( - exif.get('ReleaseDate'), exif.get('ReleaseTime')[:8] - ) - del(exif['ReleaseDate']) - del(exif['ReleaseTime']) - - for k, v in exif.items(): - self[k] = self.exifdate2rfc(v) - - def exifdate2rfc(self, value): - """ converts and EXIF date string to RFC 3339 format - - :param value: EXIF date (2016:05:01 00:08:24) - :type 
arg1: str - :return: RFC 3339 string with UTC timezone 2016-05-01T00:08:24+00:00 - :rtype: str - """ - if not isinstance(value, str): - return value - match = EXIFDATE.match(value) - if not match: - return value - return "%s-%s-%sT%s+00:00" % ( - match.group('year'), - match.group('month'), - match.group('day'), - match.group('time') - )
A
meta.py
@@ -0,0 +1,270 @@
+__author__ = "Peter Molnar" +__copyright__ = "Copyright 2017-2018, Peter Molnar" +__license__ = "apache-2.0" +__maintainer__ = "Peter Molnar" +__email__ = "mail@petermolnar.net" + +import re +import subprocess +import json +import os +import keys +import requests +import logging + +from pprint import pprint + +EXIFDATE = re.compile( + r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+' + r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$' +) + +class CachedMeta(dict): + def __init__(self, fpath): + self.fpath = fpath + + @property + def cfile(self): + return os.path.join( + os.path.dirname(self.fpath), + ".%s.%s.json" % ( + self.__class__.__name__, + os.path.basename(self.fpath) + ) + ) + + @property + def _is_cached(self): + if os.path.exists(self.cfile): + mtime = os.path.getmtime(self.fpath) + ctime = os.path.getmtime(self.cfile) + if ctime >= mtime: + return True + return False + + def _read(self): + if not self._is_cached: + self._call_tool() + self._cache_update() + else: + self._cache_read() + + def _cache_update(self): + with open(self.cfile, 'wt') as f: + f.write(json.dumps(self, indent=4, sort_keys=True)) + + def _cache_read(self): + with open(self.cfile, 'rt') as f: + data = json.loads(f.read()) + for k, v in data.items(): + self[k] = v + +class GoogleClassifyText(CachedMeta): + def __init__(self, fpath, txt, lang='en'): + self.fpath = fpath + self.txt = txt + self.lang = lang + self._read() + + def _call_tool(self): + params = { + "document": { + "type": "PLAIN_TEXT", + "content": self.txt, + "language": self.lang, + } + } + + url = "https://language.googleapis.com/v1beta2/documents:classifyText?key=%s" % ( + keys.gcloud.get('key') + ) + logging.info('calling Google classidyText') + r = requests.post(url, json=params) + try: + resp = r.json() + for cat in resp.get('categories', []): + self[cat.get('name')] = cat.get('confidence') + except Exception as e: + logging.error( + 'failed to call Google Vision API on: %s, reason: %s', + self.fpath, + e + ) + 
+class GoogleVision(CachedMeta): + def __init__(self, fpath, imgurl): + self.fpath = fpath + self.imgurl = imgurl + self._read() + + @property + def response(self): + if 'responses' not in self: + return {} + if not len(self['responses']): + return {} + if 'labelAnnotations' not in self['responses'][0]: + return {} + return self['responses'][0] + + @property + def tags(self): + tags = [] + + if 'labelAnnotations' in self.response: + for label in self.response['labelAnnotations']: + tags.append(label['description']) + + if 'webDetection' in self.response: + if 'webEntities' in self.response['webDetection']: + for label in self.response['webDetection']['webEntities']: + tags.append(label['description']) + return tags + + @property + def landmark(self): + landmark = None + if 'landmarkAnnotations' in self.response: + if len(self.response['landmarkAnnotations']): + match = self.response['landmarkAnnotations'].pop() + landmark = { + 'name': match['description'], + 'latitude': match['locations'][0]['latLng']['latitude'], + 'longitude': match['locations'][0]['latLng']['longitude'] + } + return landmark + + @property + def onlinecopies(self): + copies = [] + if 'webDetection' in self.response: + if 'pagesWithMatchingImages' in self.response['webDetection']: + for match in self.response['webDetection']['pagesWithMatchingImages']: + copies.append(match['url']) + return copies + + def _call_tool(self): + params = { + "requests": [{ + "image": {"source": {"imageUri": self.imgurl}}, + "features": [ + { + "type": "LANDMARK_DETECTION", + }, + { + "type": "WEB_DETECTION", + }, + { + "type": "LABEL_DETECTION", + } + ] + }] + } + + url = "https://vision.googleapis.com/v1/images:annotate?key=%s" % ( + keys.gcloud.get('key') + ) + logging.info('calling Google Vision API for %s', self.fpath) + r = requests.post(url, json=params) + try: + resp = r.json() + for k, v in resp.items(): + self[k] = v + except Exception as e: + logging.error( + 'failed to call Google Vision API on: %s, 
reason: %s', + self.fpath, + e + ) + +class Exif(CachedMeta): + def __init__(self, fpath): + self.fpath = fpath + self._read() + + def _call_tool(self): + """ + Why like this: the # on some of the params forces exiftool to + display values like decimals, so the latitude / longitude params + can be used and parsed in a sane way + + If only -json is passed, it gets everything nicely, but in the default + format, which would require another round to parse + + """ + cmd = ( + "exiftool", + '-sort', + '-json', + '-MIMEType', + '-FileType', + '-FileName', + '-FileSize#', + '-ModifyDate', + '-CreateDate', + '-DateTimeOriginal', + '-ImageHeight', + '-ImageWidth', + '-Aperture', + '-FOV', + '-ISO', + '-FocalLength', + '-FNumber', + '-FocalLengthIn35mmFormat', + '-ExposureTime', + '-Model', + '-GPSLongitude#', + '-GPSLatitude#', + '-LensID', + '-LensSpec', + '-Lens', + '-ReleaseDate', + '-Description', + '-Headline', + '-HierarchicalSubject', + '-Copyright', + '-Artist', + self.fpath + ) + + p = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + stdout, stderr = p.communicate() + if stderr: + raise OSError("Error reading EXIF:\n\t%s\n\t%s", cmd, stderr) + + exif = json.loads(stdout.decode('utf-8').strip()).pop() + if 'ReleaseDate' in exif and 'ReleaseTime' in exif: + exif['DateTimeRelease'] = "%s %s" % ( + exif.get('ReleaseDate'), exif.get('ReleaseTime')[:8] + ) + del(exif['ReleaseDate']) + del(exif['ReleaseTime']) + + for k, v in exif.items(): + self[k] = self.exifdate2rfc(v) + + def exifdate2rfc(self, value): + """ converts and EXIF date string to RFC 3339 format + + :param value: EXIF date (2016:05:01 00:08:24) + :type arg1: str + :return: RFC 3339 string with UTC timezone 2016-05-01T00:08:24+00:00 + :rtype: str + """ + if not isinstance(value, str): + return value + match = EXIFDATE.match(value) + if not match: + return value + return "%s-%s-%sT%s+00:00" % ( + match.group('year'), + match.group('month'), + 
match.group('day'), + match.group('time') + )
M
nasg.py
→
nasg.py
@@ -30,7 +30,7 @@ from emoji import UNICODE_EMOJI
from slugify import slugify import requests from pandoc import Pandoc -from exiftool import Exif, GoogleVision +from meta import Exif, GoogleVision, GoogleClassifyText import settings import keys@@ -507,6 +507,16 @@ pass
return lang @property + def classification(self): + c = GoogleClassifyText(self.fpath, self.content, self.lang) + k = '/Arts & Entertainment/Visual Art & Design/Photographic & Digital Arts' + if self.is_photo and k not in c.keys(): + c.update({ + k : '1.0' + }) + return c + + @property def url(self): return "%s/%s/" % ( settings.site.get('url'),@@ -578,6 +588,7 @@ 'url': self.url,
'review': self.review, 'has_code': self.has_code, 'event': self.event, + 'classification': self.classification.keys() } if (self.is_photo): v.update({@@ -707,7 +718,8 @@ 'title': self.title,
'caption': self.caption, 'exif': self.exif, 'is_photo': self.is_photo, - 'is_mainimg': self.is_mainimg + 'is_mainimg': self.is_mainimg, + 'onlinecopies': self.onlinecopies } def __str__(self):@@ -717,8 +729,16 @@ tmpl = J2.get_template("%s.j2.html" % (self.__class__.__name__))
return tmpl.render(self.tmplvars) @cached_property - def vision(self): + def visionapi(self): return GoogleVision(self.fpath, self.src) + + @property + def onlinecopies(self): + copies = {} + for m in self.visionapi.onlinecopies: + if settings.site.get('domain') not in m: + copies[m] = True + return copies.keys() @cached_property def meta(self):
M
settings.py
→
settings.py
@@ -60,6 +60,7 @@ 'flickr': 'https://flickr.com/people/petermolnareu',
'github': 'https://github.com/petermolnar', 'instagram': 'https://www.instagram.com/petermolnarnet/', 'twitter': 'https://twitter.com/petermolnar', + 'micro.blog': 'https://micro.blog/petermolnar', } }
M
templates/WebImage.j2.html
→
templates/WebImage.j2.html
@@ -54,5 +54,12 @@ {{ exif.lens }}
</dd> </dl> {% endif %} +{% if onlinecopies|length > 1 %} +<ul> +{% for copy in onlinecopies %} + <li><a href="{{ copy }}">[{{ loop.index }}]</a></li> +{% endfor %} +</ul> +{% endif %} </figcaption> </figure>
M
templates/base.j2.html
→
templates/base.j2.html
@@ -218,6 +218,15 @@ {{ post.url }}
</a> </dd> + <dt>Classification</dt> + <dd> + <ul> + {% for c in post.classification %} + <li>{{ c }}</li> + {% endfor %} + </ul> + </dd> + <dt>License</dt> <dd class="license"> {% if post.licence == 'CC-BY-4.0' %}
M
templates/style.css
→
templates/style.css
@@ -133,10 +133,20 @@ input {
border-bottom: 3px solid #ccc; } +figcaption > ul, nav ul { list-style-type: none; margin: 0; padding: 0; +} + +figcaption > ul { + display:none; + text-align: right; +} + +figcaption ul li { + display: inline-block; } nav li {