petermolnar's repositories — nasg: 26c6ef77edd9f7cef90175fd4abde2a31749779d

- now checking images against the Google Vision API (why Google, despite my dislike of Google, the company? Because the Vision API is the only one which is simple enough to use, and its labelling is reasonable).
- checking text against the Google Natural Language API: the strict classification it offers is better than a free folksonomy if I ever want to connect entries based on topic;
unfortunately, it doesn't support Hungarian yet.

Peter Molnar hello@petermolnar.eu

Tue, 11 Dec 2018 14:06:18 +0000

commit

26c6ef77edd9f7cef90175fd4abde2a31749779d

parent

033a00db8ec281d028c3ed615b9f0e05a73c91c0

8 files changed, 320 insertions(+), 206 deletions(-)

jump to

.gitignore

meta.py

nasg.py

settings.py

templates/WebImage.j2.html

templates/base.j2.html

templates/style.css

M .gitignore → .gitignore

@@ -5,4 +5,3 @@ .idea
 lib
 gcloud.json
 tests/.Exif.tests.jpg.json
-Pipfile.lock

D exiftool.py

@@ -1,202 +0,0 @@
-__author__ = "Peter Molnar"
-__copyright__ = "Copyright 2017-2018, Peter Molnar"
-__license__ = "apache-2.0"
-__maintainer__ = "Peter Molnar"
-__email__ = "mail@petermolnar.net"
-
-import re
-import subprocess
-import json
-import os
-import keys
-import requests
-
-from pprint import pprint
-
-EXIFDATE = re.compile(
-    r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
-    r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
-)
-
-class CachedMeta(dict):
-    def __init__(self, fpath):
-        self.fpath = fpath
-
-    @property
-    def cfile(self):
-        return os.path.join(
-            os.path.dirname(self.fpath),
-            ".%s.%s.json" % (
-                self.__class__.__name__,
-                os.path.basename(self.fpath)
-            )
-        )
-
-    @property
-    def _is_cached(self):
-        if os.path.exists(self.cfile):
-            mtime = os.path.getmtime(self.fpath)
-            ctime = os.path.getmtime(self.cfile)
-            if ctime >= mtime:
-                return True
-        return False
-
-    def _read(self):
-        if not self._is_cached:
-            self._call_tool()
-            self._cache_update()
-        else:
-            self._cache_read()
-
-    def _cache_update(self):
-        with open(self.cfile, 'wt') as f:
-            f.write(json.dumps(self, indent=4, sort_keys=True))
-
-    def _cache_read(self):
-        with open(self.cfile, 'rt') as f:
-            data = json.loads(f.read())
-            for k, v in data.items():
-                self[k] = v
-
-class GoogleVision(CachedMeta):
-    def __init__(self, fpath, imgurl):
-        self.fpath = fpath
-        self.imgurl = imgurl
-        self._read()
-
-    @property
-    def cntr(self):
-        curr = 0
-        if os.path.exists('/tmp/visionapicallcounter'):
-            with open('/tmp/visionapicallcounter', 'rt') as f:
-                curr = int(f.read())
-        curr = curr + 1
-        with open('/tmp/visionapicallcounter', 'wt') as f:
-            f.write("%d" % curr)
-        return curr
-
-    def _call_tool(self):
-        if (self.cntr >= 500 ):
-            raise ValueError('already at 500 requests!')
-
-        params = {
-          "requests": [
-            {
-              "image": {
-                "source": {
-                  "imageUri": self.imgurl,
-                }
-              },
-              "features": [
-                {
-                  "type": "LANDMARK_DETECTION",
-                },
-                {
-                  "type": "LABEL_DETECTION",
-                },
-              ]
-            }
-          ]
-        }
-
-        url = "https://vision.googleapis.com/v1/images:annotate?key=%s" % (keys.gcloud.get('key'))
-        r = requests.post(url, json=params)
-        try:
-            resp = r.json()
-            resp = resp['responses'][0]
-            for k, v in resp.items():
-                self[k] = v
-        except Exception as e:
-            logging.error('failed to call Google Vision API on: %s, reason: %s', self.fpath, e)
-
-class Exif(CachedMeta):
-    def __init__(self, fpath):
-        self.fpath = fpath
-        self._read()
-
-    def _call_tool(self):
-        """
-        Why like this: the # on some of the params forces exiftool to
-        display values like decimals, so the latitude / longitude params
-        can be used and parsed in a sane way
-
-        If only -json is passed, it gets everything nicely, but in the default
-        format, which would require another round to parse
-
-        """
-        cmd = (
-            "exiftool",
-            '-sort',
-            '-json',
-            '-MIMEType',
-            '-FileType',
-            '-FileName',
-            '-FileSize#',
-            '-ModifyDate',
-            '-CreateDate',
-            '-DateTimeOriginal',
-            '-ImageHeight',
-            '-ImageWidth',
-            '-Aperture',
-            '-FOV',
-            '-ISO',
-            '-FocalLength',
-            '-FNumber',
-            '-FocalLengthIn35mmFormat',
-            '-ExposureTime',
-            '-Model',
-            '-GPSLongitude#',
-            '-GPSLatitude#',
-            '-LensID',
-            '-LensSpec',
-            '-Lens',
-            '-ReleaseDate',
-            '-Description',
-            '-Headline',
-            '-HierarchicalSubject',
-            '-Copyright',
-            '-Artist',
-            self.fpath
-        )
-
-        p = subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
-
-        stdout, stderr = p.communicate()
-        if stderr:
-            raise OSError("Error reading EXIF:\n\t%s\n\t%s", cmd, stderr)
-
-        exif = json.loads(stdout.decode('utf-8').strip()).pop()
-        if 'ReleaseDate' in exif and 'ReleaseTime' in exif:
-            exif['DateTimeRelease'] = "%s %s" % (
-                exif.get('ReleaseDate'), exif.get('ReleaseTime')[:8]
-            )
-            del(exif['ReleaseDate'])
-            del(exif['ReleaseTime'])
-
-        for k, v in exif.items():
-            self[k] = self.exifdate2rfc(v)
-
-    def exifdate2rfc(self, value):
-        """ converts and EXIF date string to RFC 3339 format
-
-        :param value: EXIF date (2016:05:01 00:08:24)
-        :type arg1: str
-        :return: RFC 3339 string with UTC timezone 2016-05-01T00:08:24+00:00
-        :rtype: str
-        """
-        if not isinstance(value, str):
-            return value
-        match = EXIFDATE.match(value)
-        if not match:
-            return value
-        return "%s-%s-%sT%s+00:00" % (
-            match.group('year'),
-            match.group('month'),
-            match.group('day'),
-            match.group('time')
-        )

A meta.py

@@ -0,0 +1,270 @@
+__author__ = "Peter Molnar"
+__copyright__ = "Copyright 2017-2018, Peter Molnar"
+__license__ = "apache-2.0"
+__maintainer__ = "Peter Molnar"
+__email__ = "mail@petermolnar.net"
+
+import re
+import subprocess
+import json
+import os
+import keys
+import requests
+import logging
+
+from pprint import pprint
+
+EXIFDATE = re.compile(
+    r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
+    r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
+)
+
+class CachedMeta(dict):
+    def __init__(self, fpath):
+        self.fpath = fpath
+
+    @property
+    def cfile(self):
+        return os.path.join(
+            os.path.dirname(self.fpath),
+            ".%s.%s.json" % (
+                self.__class__.__name__,
+                os.path.basename(self.fpath)
+            )
+        )
+
+    @property
+    def _is_cached(self):
+        if os.path.exists(self.cfile):
+            mtime = os.path.getmtime(self.fpath)
+            ctime = os.path.getmtime(self.cfile)
+            if ctime >= mtime:
+                return True
+        return False
+
+    def _read(self):
+        if not self._is_cached:
+            self._call_tool()
+            self._cache_update()
+        else:
+            self._cache_read()
+
+    def _cache_update(self):
+        with open(self.cfile, 'wt') as f:
+            f.write(json.dumps(self, indent=4, sort_keys=True))
+
+    def _cache_read(self):
+        with open(self.cfile, 'rt') as f:
+            data = json.loads(f.read())
+            for k, v in data.items():
+                self[k] = v
+
+class GoogleClassifyText(CachedMeta):
+    def __init__(self, fpath, txt, lang='en'):
+        self.fpath = fpath
+        self.txt = txt
+        self.lang = lang
+        self._read()
+
+    def _call_tool(self):
+        params = {
+            "document": {
+                "type": "PLAIN_TEXT",
+                "content": self.txt,
+                "language": self.lang,
+            }
+        }
+
+        url = "https://language.googleapis.com/v1beta2/documents:classifyText?key=%s" % (
+            keys.gcloud.get('key')
+        )
+        logging.info('calling Google classidyText')
+        r = requests.post(url, json=params)
+        try:
+            resp = r.json()
+            for cat in resp.get('categories', []):
+                self[cat.get('name')] = cat.get('confidence')
+        except Exception as e:
+            logging.error(
+                'failed to call Google Vision API on: %s, reason: %s',
+                self.fpath,
+                e
+            )
+
+class GoogleVision(CachedMeta):
+    def __init__(self, fpath, imgurl):
+        self.fpath = fpath
+        self.imgurl = imgurl
+        self._read()
+
+    @property
+    def response(self):
+        if 'responses' not in self:
+            return {}
+        if not len(self['responses']):
+            return {}
+        if 'labelAnnotations' not in self['responses'][0]:
+            return {}
+        return self['responses'][0]
+
+    @property
+    def tags(self):
+        tags = []
+
+        if 'labelAnnotations' in self.response:
+            for label in self.response['labelAnnotations']:
+                tags.append(label['description'])
+
+        if 'webDetection' in self.response:
+            if 'webEntities' in self.response['webDetection']:
+                for label in self.response['webDetection']['webEntities']:
+                    tags.append(label['description'])
+        return tags
+
+    @property
+    def landmark(self):
+        landmark = None
+        if 'landmarkAnnotations' in self.response:
+            if len(self.response['landmarkAnnotations']):
+                match = self.response['landmarkAnnotations'].pop()
+                landmark = {
+                    'name': match['description'],
+                    'latitude': match['locations'][0]['latLng']['latitude'],
+                    'longitude': match['locations'][0]['latLng']['longitude']
+                }
+        return landmark
+
+    @property
+    def onlinecopies(self):
+        copies = []
+        if 'webDetection' in self.response:
+            if 'pagesWithMatchingImages' in self.response['webDetection']:
+                for match in self.response['webDetection']['pagesWithMatchingImages']:
+                    copies.append(match['url'])
+        return copies
+
+    def _call_tool(self):
+        params = {
+            "requests": [{
+                "image": {"source": {"imageUri": self.imgurl}},
+                "features": [
+                    {
+                      "type": "LANDMARK_DETECTION",
+                    },
+                    {
+                      "type": "WEB_DETECTION",
+                    },
+                    {
+                      "type": "LABEL_DETECTION",
+                    }
+                ]
+            }]
+        }
+
+        url = "https://vision.googleapis.com/v1/images:annotate?key=%s" % (
+            keys.gcloud.get('key')
+        )
+        logging.info('calling Google Vision API for %s', self.fpath)
+        r = requests.post(url, json=params)
+        try:
+            resp = r.json()
+            for k, v in resp.items():
+                self[k] = v
+        except Exception as e:
+            logging.error(
+                'failed to call Google Vision API on: %s, reason: %s',
+                self.fpath,
+                e
+            )
+
+class Exif(CachedMeta):
+    def __init__(self, fpath):
+        self.fpath = fpath
+        self._read()
+
+    def _call_tool(self):
+        """
+        Why like this: the # on some of the params forces exiftool to
+        display values like decimals, so the latitude / longitude params
+        can be used and parsed in a sane way
+
+        If only -json is passed, it gets everything nicely, but in the default
+        format, which would require another round to parse
+
+        """
+        cmd = (
+            "exiftool",
+            '-sort',
+            '-json',
+            '-MIMEType',
+            '-FileType',
+            '-FileName',
+            '-FileSize#',
+            '-ModifyDate',
+            '-CreateDate',
+            '-DateTimeOriginal',
+            '-ImageHeight',
+            '-ImageWidth',
+            '-Aperture',
+            '-FOV',
+            '-ISO',
+            '-FocalLength',
+            '-FNumber',
+            '-FocalLengthIn35mmFormat',
+            '-ExposureTime',
+            '-Model',
+            '-GPSLongitude#',
+            '-GPSLatitude#',
+            '-LensID',
+            '-LensSpec',
+            '-Lens',
+            '-ReleaseDate',
+            '-Description',
+            '-Headline',
+            '-HierarchicalSubject',
+            '-Copyright',
+            '-Artist',
+            self.fpath
+        )
+
+        p = subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+
+        stdout, stderr = p.communicate()
+        if stderr:
+            raise OSError("Error reading EXIF:\n\t%s\n\t%s", cmd, stderr)
+
+        exif = json.loads(stdout.decode('utf-8').strip()).pop()
+        if 'ReleaseDate' in exif and 'ReleaseTime' in exif:
+            exif['DateTimeRelease'] = "%s %s" % (
+                exif.get('ReleaseDate'), exif.get('ReleaseTime')[:8]
+            )
+            del(exif['ReleaseDate'])
+            del(exif['ReleaseTime'])
+
+        for k, v in exif.items():
+            self[k] = self.exifdate2rfc(v)
+
+    def exifdate2rfc(self, value):
+        """ converts and EXIF date string to RFC 3339 format
+
+        :param value: EXIF date (2016:05:01 00:08:24)
+        :type arg1: str
+        :return: RFC 3339 string with UTC timezone 2016-05-01T00:08:24+00:00
+        :rtype: str
+        """
+        if not isinstance(value, str):
+            return value
+        match = EXIFDATE.match(value)
+        if not match:
+            return value
+        return "%s-%s-%sT%s+00:00" % (
+            match.group('year'),
+            match.group('month'),
+            match.group('day'),
+            match.group('time')
+        )

M nasg.py → nasg.py

@@ -30,7 +30,7 @@ from emoji import UNICODE_EMOJI
 from slugify import slugify
 import requests
 from pandoc import Pandoc
-from exiftool import Exif, GoogleVision
+from meta import Exif, GoogleVision, GoogleClassifyText
 import settings
 import keys
 
@@ -507,6 +507,16 @@ pass
         return lang
 
     @property
+    def classification(self):
+        c = GoogleClassifyText(self.fpath, self.content, self.lang)
+        k = '/Arts & Entertainment/Visual Art & Design/Photographic & Digital Arts'
+        if self.is_photo and k not in c.keys():
+            c.update({
+                k : '1.0'
+            })
+        return c
+
+    @property
     def url(self):
         return "%s/%s/" % (
             settings.site.get('url'),
@@ -578,6 +588,7 @@ 'url': self.url,
             'review': self.review,
             'has_code': self.has_code,
             'event': self.event,
+            'classification': self.classification.keys()
         }
         if (self.is_photo):
             v.update({
@@ -707,7 +718,8 @@ 'title': self.title,
             'caption': self.caption,
             'exif': self.exif,
             'is_photo': self.is_photo,
-            'is_mainimg': self.is_mainimg
+            'is_mainimg': self.is_mainimg,
+            'onlinecopies': self.onlinecopies
         }
 
     def __str__(self):
@@ -717,8 +729,16 @@ tmpl = J2.get_template("%s.j2.html" % (self.__class__.__name__))
         return tmpl.render(self.tmplvars)
 
     @cached_property
-    def vision(self):
+    def visionapi(self):
         return GoogleVision(self.fpath, self.src)
+
+    @property
+    def onlinecopies(self):
+        copies = {}
+        for m in self.visionapi.onlinecopies:
+            if settings.site.get('domain') not in m:
+                copies[m] = True
+        return copies.keys()
 
     @cached_property
     def meta(self):

M settings.py → settings.py

@@ -60,6 +60,7 @@ 'flickr': 'https://flickr.com/people/petermolnareu',
         'github': 'https://github.com/petermolnar',
         'instagram': 'https://www.instagram.com/petermolnarnet/',
         'twitter': 'https://twitter.com/petermolnar',
+        'micro.blog': 'https://micro.blog/petermolnar',
     }
 }

M templates/WebImage.j2.html → templates/WebImage.j2.html

@@ -54,5 +54,12 @@ {{ exif.lens }}
             </dd>
         </dl>
 {% endif %}
+{% if onlinecopies|length > 1 %}
+<ul>
+{% for copy in onlinecopies %}
+    <li><a href="{{ copy }}">[{{ loop.index }}]</a></li>
+{% endfor %}
+</ul>
+{% endif %}
     </figcaption>
 </figure>

M templates/base.j2.html → templates/base.j2.html

@@ -218,6 +218,15 @@ {{ post.url }}
                     </a>
                 </dd>
 
+                <dt>Classification</dt>
+                <dd>
+                    <ul>
+                    {% for c in post.classification %}
+                        <li>{{ c }}</li>
+                    {% endfor %}
+                    </ul>
+                </dd>
+
                 <dt>License</dt>
                 <dd class="license">
                 {% if post.licence == 'CC-BY-4.0' %}

M templates/style.css → templates/style.css

@@ -133,10 +133,20 @@ input {
   border-bottom: 3px solid #ccc;
 }
 
+figcaption > ul,
 nav ul {
   list-style-type: none;
   margin: 0;
   padding: 0;
+}
+
+figcaption > ul {
+  display:none;
+  text-align: right;
+}
+
+figcaption ul li {
+  display: inline-block;
 }
 
 nav li {