# nasg/meta.py

__author__ = "Peter Molnar"
__copyright__ = "Copyright 2017-2018, Peter Molnar"
__license__ = "apache-2.0"
__maintainer__ = "Peter Molnar"
__email__ = "mail@petermolnar.net"

import re
import subprocess
import json
import os
import logging
import requests
import keys
import settings

from pprint import pprint

# Matches an EXIF-style timestamp such as "2016:05:01 00:08:24": a
# colon-separated date, whitespace, then HH:MM:SS. The whole string must
# match; the named groups (year, month, day, time) are read back by
# Exif.exifdate2rfc when it rebuilds the value in RFC 3339 form.
EXIFDATE = re.compile(
    r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'
    r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'
)

class CachedMeta(dict):
    """Dictionary of per-file metadata backed by a JSON cache file.

    ``_read()`` either loads the cached JSON or, when the cache is missing
    or older than the source file, calls ``self._call_tool()`` (not defined
    here; the subclasses in this module provide it) and rewrites the cache.
    """

    def __init__(self, fpath):
        self.fpath = fpath

    @property
    def cfile(self):
        """Path of the JSON cache file that belongs to ``self.fpath``."""
        stem = os.path.basename(self.fpath)
        if stem == 'index.md':
            # name the cache after the directory that contains index.md
            stem = os.path.basename(os.path.dirname(self.fpath))

        cache_name = "%s.%s.json" % (stem, self.__class__.__name__)
        return os.path.join(settings.paths.get('tmp', 'tmp'), cache_name)

    @property
    def _is_cached(self):
        """True if the cache file exists and is not older than the source."""
        cache_path = self.cfile
        if not os.path.exists(cache_path):
            return False
        source_mtime = os.path.getmtime(self.fpath)
        return os.path.getmtime(cache_path) >= source_mtime

    def _read(self):
        """Fill the dict from the cache, or from the tool when it is stale."""
        if self._is_cached:
            self._cache_read()
            return
        self._call_tool()
        self._cache_update()

    def _cache_update(self):
        """Write the current contents to the cache file as sorted JSON."""
        with open(self.cfile, 'wt') as handle:
            logging.debug(
                "writing cached meta file of %s to %s",
                self.fpath,
                self.cfile
            )
            payload = json.dumps(self, indent=4, sort_keys=True)
            handle.write(payload)

    def _cache_read(self):
        """Copy every key/value pair stored in the cache file into self."""
        with open(self.cfile, 'rt') as handle:
            cached = json.load(handle)
        for key in cached:
            self[key] = cached[key]

class GoogleClassifyText(CachedMeta):
    """Cached result of Google Natural Language ``classifyText`` for a text.

    After construction the instance maps each category name found in the
    API response to its confidence value.
    """

    def __init__(self, fpath, txt, lang='en'):
        """
        :param fpath: path of the source file; used for the cache file name,
            the freshness check and log messages
        :param txt: plain text content sent for classification
        :param lang: language code sent along with the document
        """
        self.fpath = fpath
        self.txt = txt
        self.lang = lang
        self._read()

    def _call_tool(self):
        """POST the text to the classifyText endpoint and store categories.

        A response that cannot be parsed is logged and leaves the dict
        unchanged; errors raised by ``requests.post`` itself propagate.
        """
        params = {
            "document": {
                "type": "PLAIN_TEXT",
                "content": self.txt,
                "language": self.lang,
            }
        }

        url = "https://language.googleapis.com/v1beta2/documents:classifyText?key=%s" % (
            keys.gcloud.get('key')
        )
        logging.info(
            "calling Google classifyText for %s",
            self.fpath
        )
        r = requests.post(url, json=params)
        try:
            resp = r.json()
            if 'error' in resp:
                # surface API-side failures instead of silently storing
                # an empty result
                logging.error(
                    'Google classifyText API returned an error for %s: %s',
                    self.fpath,
                    resp['error']
                )
            for cat in resp.get('categories', []):
                self[cat.get('name')] = cat.get('confidence')
        except Exception as e:
            # the message used to name the Vision API, which this class
            # does not call
            logging.error(
                'failed to call Google classifyText API on: %s, reason: %s',
                self.fpath,
                e
            )

class GoogleVision(CachedMeta):
    """Cached Google Vision ``images:annotate`` result for one image.

    The raw API response is stored in the dict itself; the properties
    below extract the parts of the first response entry.
    """

    def __init__(self, fpath, imgurl):
        """
        :param fpath: path of the source file; used for the cache file name,
            the freshness check and log messages
        :param imgurl: URL of the image, sent to the API as ``imageUri``
        """
        self.fpath = fpath
        self.imgurl = imgurl
        self._read()

    @property
    def response(self):
        """First entry of ``responses``, or ``{}`` when it is unusable.

        An entry without ``labelAnnotations`` is treated as unusable, so
        every other property yields its empty value in that case.
        """
        responses = self.get('responses')
        if not responses:
            return {}
        first = responses[0]
        if 'labelAnnotations' not in first:
            return {}
        return first

    @property
    def tags(self):
        """Descriptions of all label annotations and web entities.

        Entries that carry no ``description`` are skipped instead of
        raising ``KeyError``.
        """
        response = self.response
        candidates = list(response.get('labelAnnotations', []))
        candidates.extend(
            response.get('webDetection', {}).get('webEntities', [])
        )
        return [
            entry['description']
            for entry in candidates
            if 'description' in entry
        ]

    @property
    def landmark(self):
        """Name and coordinates of the last landmark annotation, or None."""
        annotations = self.response.get('landmarkAnnotations')
        if not annotations:
            return None
        # read the last entry without pop(): popping removed it from the
        # stored response, so a second access returned a different result
        match = annotations[-1]
        position = match['locations'][0]['latLng']
        return {
            'name': match['description'],
            'latitude': position['latitude'],
            'longitude': position['longitude']
        }

    @property
    def onlinecopies(self):
        """URLs of the pages reported as containing a matching image."""
        pages = self.response.get('webDetection', {}).get(
            'pagesWithMatchingImages', []
        )
        return [page['url'] for page in pages]

    def _call_tool(self):
        """POST the image URL to the Vision API and store the response.

        Requests landmark, web and label detection. A response that cannot
        be parsed is logged and leaves the dict unchanged; errors raised by
        ``requests.post`` itself propagate.
        """
        params = {
            "requests": [{
                "image": {"source": {"imageUri": self.imgurl}},
                "features": [
                    {
                      "type": "LANDMARK_DETECTION",
                    },
                    {
                      "type": "WEB_DETECTION",
                    },
                    {
                      "type": "LABEL_DETECTION",
                    }
                ]
            }]
        }

        url = "https://vision.googleapis.com/v1/images:annotate?key=%s" % (
            keys.gcloud.get('key')
        )
        logging.info(
            "calling Google Vision for %s",
            self.fpath
        )
        r = requests.post(url, json=params)
        try:
            resp = r.json()
            for k, v in resp.items():
                self[k] = v
        except Exception as e:
            logging.error(
                'failed to call Google Vision API on: %s, reason: %s',
                self.fpath,
                e
            )

class Exif(CachedMeta):
    """Cached subset of the EXIF/metadata tags exiftool reports for a file."""

    def __init__(self, fpath):
        """
        :param fpath: path of the file handed to exiftool; also used for
            the cache file name and the freshness check
        """
        self.fpath = fpath
        self._read()

    def _call_tool(self):
        """
        Run exiftool on the file and store the returned tags in self.

        Why like this: the # on some of the params forces exiftool to
        display values like decimals, so the latitude / longitude params
        can be used and parsed in a sane way

        If only -json is passed, it gets everything nicely, but in the default
        format, which would require another round to parse

        :raises OSError: if exiftool wrote anything to stderr
        """
        cmd = (
            "exiftool",
            '-sort',
            '-json',
            '-MIMEType',
            '-FileType',
            '-FileName',
            '-FileSize#',
            '-ModifyDate',
            '-CreateDate',
            '-DateTimeOriginal',
            '-ImageHeight',
            '-ImageWidth',
            '-Aperture',
            '-FOV',
            '-ISO',
            '-FocalLength',
            '-FNumber',
            '-FocalLengthIn35mmFormat',
            '-ExposureTime',
            '-Model',
            '-GPSLongitude#',
            '-GPSLatitude#',
            '-LensID',
            '-LensSpec',
            '-Lens',
            '-ReleaseDate',
            '-Description',
            '-Headline',
            '-HierarchicalSubject',
            '-Copyright',
            '-Artist',
            self.fpath
        )

        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        stdout, stderr = p.communicate()
        if stderr:
            # OSError does not interpolate logging-style arguments; three
            # positional arguments are read as (errno, strerror, filename),
            # so the message has to be formatted before raising.
            raise OSError(
                "Error reading EXIF:\n\t%s\n\t%s" % (
                    cmd,
                    stderr.decode('utf-8', errors='replace').strip(),
                )
            )

        # exiftool -json prints a list of objects; the last one is used
        exif = json.loads(stdout.decode('utf-8').strip()).pop()
        # NOTE(review): '-ReleaseTime' is not requested in cmd above, so this
        # branch presumably never fires -- confirm whether the tag should be
        # added to the command.
        if 'ReleaseDate' in exif and 'ReleaseTime' in exif:
            exif['DateTimeRelease'] = "%s %s" % (
                exif.get('ReleaseDate'), exif.get('ReleaseTime')[:8]
            )
            del exif['ReleaseDate']
            del exif['ReleaseTime']

        for k, v in exif.items():
            self[k] = self.exifdate2rfc(v)

    def exifdate2rfc(self, value):
        """ converts an EXIF date string to RFC 3339 format

        Values that are not strings, or strings that do not look like an
        EXIF date, are returned unchanged.

        :param value: EXIF date (2016:05:01 00:08:24)
        :type value: str
        :return: RFC 3339 string with UTC timezone 2016-05-01T00:08:24+00:00
        :rtype: str
        """
        if not isinstance(value, str):
            return value
        match = EXIFDATE.match(value)
        if not match:
            return value
        return "%s-%s-%sT%s+00:00" % (
            match.group('year'),
            match.group('month'),
            match.group('day'),
            match.group('time')
        )
After long discussions, mainly listening and reading, I'm giving up on GPL licencing and moving to Apache 2.0. The short summary is that while I still sort of believe in what GPL stands for, reality is not that simple to immediately open source everything. Because GPL is scary, many people avoid it, and one of the main achievements of open source should be that nobody has to reinvent the wheel. 2018-12-03 10:36:10 +00:00			`__author__ = "Peter Molnar"`
			`__copyright__ = "Copyright 2017-2018, Peter Molnar"`
			`__license__ = "apache-2.0"`
			`__maintainer__ = "Peter Molnar"`
			`__email__ = "mail@petermolnar.net"`

v4.0a 2018-07-20 16:45:42 +01:00			`import re`
			`import subprocess`
			`import json`
			`import os`
- now checking images against google vision api (why google, despite my despise of google, the company): vision api is the only one which is simple enough to use and their labelling is reasonable. - checking text against google natural language api: the strict classification it offers is better, than free folksonomy, if I ever want to connect entries based on topic unfortunately they don't support Hungarian yet. 2018-12-11 14:06:18 +00:00			`import logging`
re-adding immediate async processing; is_page added to hide a few footer metadata in page type posts; unused symbols removed, but assets updated; footer extended with a lot of extra information; kcl action replaced; 2018-12-27 19:48:06 +00:00			`import requests`
			`import keys`
			`import settings`
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00
			`from pprint import pprint`
v4.0a 2018-07-20 16:45:42 +01:00
			`EXIFDATE = re.compile(`
			`r'^(?P<year>[0-9]{4}):(?P<month>[0-9]{2}):(?P<day>[0-9]{2})\s+'`
			`r'(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2})$'`
			`)`

- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00			`class CachedMeta(dict):`
v4.0a 2018-07-20 16:45:42 +01:00			`def __init__(self, fpath):`
			`self.fpath = fpath`

			`@property`
			`def cfile(self):`
re-adding immediate async processing; is_page added to hide a few footer metadata in page type posts; unused symbols removed, but assets updated; footer extended with a lot of extra information; kcl action replaced; 2018-12-27 19:48:06 +00:00			`fname = os.path.basename(self.fpath)`
			`if fname == 'index.md':`
			`fname = os.path.basename(os.path.dirname(self.fpath))`

v4.0a 2018-07-20 16:45:42 +01:00			`return os.path.join(`
re-adding immediate async processing; is_page added to hide a few footer metadata in page type posts; unused symbols removed, but assets updated; footer extended with a lot of extra information; kcl action replaced; 2018-12-27 19:48:06 +00:00			`settings.paths.get('tmp', 'tmp'),`
			`"%s.%s.json" % (`
			`fname,`
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00			`self.__class__.__name__,`
			`)`
v4.0a 2018-07-20 16:45:42 +01:00			`)`

			`@property`
			`def _is_cached(self):`
			`if os.path.exists(self.cfile):`
			`mtime = os.path.getmtime(self.fpath)`
			`ctime = os.path.getmtime(self.cfile)`
			`if ctime >= mtime:`
			`return True`
			`return False`

			`def _read(self):`
			`if not self._is_cached:`
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00			`self._call_tool()`
v4.0a 2018-07-20 16:45:42 +01:00			`self._cache_update()`
			`else:`
			`self._cache_read()`

			`def _cache_update(self):`
			`with open(self.cfile, 'wt') as f:`
re-adding immediate async processing; is_page added to hide a few footer metadata in page type posts; unused symbols removed, but assets updated; footer extended with a lot of extra information; kcl action replaced; 2018-12-27 19:48:06 +00:00			`logging.debug(`
			`"writing cached meta file of %s to %s",`
			`self.fpath,`
			`self.cfile`
			`)`
v4.0a 2018-07-20 16:45:42 +01:00			`f.write(json.dumps(self, indent=4, sort_keys=True))`

			`def _cache_read(self):`
			`with open(self.cfile, 'rt') as f:`
			`data = json.loads(f.read())`
			`for k, v in data.items():`
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00			`self[k] = v`

- now checking images against google vision api (why google, despite my despise of google, the company): vision api is the only one which is simple enough to use and their labelling is reasonable. - checking text against google natural language api: the strict classification it offers is better, than free folksonomy, if I ever want to connect entries based on topic unfortunately they don't support Hungarian yet. 2018-12-11 14:06:18 +00:00			`class GoogleClassifyText(CachedMeta):`
			`def __init__(self, fpath, txt, lang='en'):`
			`self.fpath = fpath`
			`self.txt = txt`
			`self.lang = lang`
			`self._read()`

			`def _call_tool(self):`
			`params = {`
			`"document": {`
			`"type": "PLAIN_TEXT",`
			`"content": self.txt,`
			`"language": self.lang,`
			`}`
			`}`

			`url = "https://language.googleapis.com/v1beta2/documents:classifyText?key=%s" % (`
			`keys.gcloud.get('key')`
			`)`
re-adding immediate async processing; is_page added to hide a few footer metadata in page type posts; unused symbols removed, but assets updated; footer extended with a lot of extra information; kcl action replaced; 2018-12-27 19:48:06 +00:00			`logging.info(`
			`"calling Google classifyText for %s",`
			`self.fpath`
			`)`
- now checking images against google vision api (why google, despite my despise of google, the company): vision api is the only one which is simple enough to use and their labelling is reasonable. - checking text against google natural language api: the strict classification it offers is better, than free folksonomy, if I ever want to connect entries based on topic unfortunately they don't support Hungarian yet. 2018-12-11 14:06:18 +00:00			`r = requests.post(url, json=params)`
			`try:`
			`resp = r.json()`
			`for cat in resp.get('categories', []):`
			`self[cat.get('name')] = cat.get('confidence')`
			`except Exception as e:`
			`logging.error(`
			`'failed to call Google Vision API on: %s, reason: %s',`
			`self.fpath,`
			`e`
			`)`

- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00			`class GoogleVision(CachedMeta):`
			`def __init__(self, fpath, imgurl):`
			`self.fpath = fpath`
			`self.imgurl = imgurl`
			`self._read()`

			`@property`
- now checking images against google vision api (why google, despite my despise of google, the company): vision api is the only one which is simple enough to use and their labelling is reasonable. - checking text against google natural language api: the strict classification it offers is better, than free folksonomy, if I ever want to connect entries based on topic unfortunately they don't support Hungarian yet. 2018-12-11 14:06:18 +00:00			`def response(self):`
			`if 'responses' not in self:`
			`return {}`
			`if not len(self['responses']):`
			`return {}`
			`if 'labelAnnotations' not in self['responses'][0]:`
			`return {}`
			`return self['responses'][0]`
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00
- now checking images against google vision api (why google, despite my despise of google, the company): vision api is the only one which is simple enough to use and their labelling is reasonable. - checking text against google natural language api: the strict classification it offers is better, than free folksonomy, if I ever want to connect entries based on topic unfortunately they don't support Hungarian yet. 2018-12-11 14:06:18 +00:00			`@property`
			`def tags(self):`
			`tags = []`
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00
- now checking images against google vision api (why google, despite my despise of google, the company): vision api is the only one which is simple enough to use and their labelling is reasonable. - checking text against google natural language api: the strict classification it offers is better, than free folksonomy, if I ever want to connect entries based on topic unfortunately they don't support Hungarian yet. 2018-12-11 14:06:18 +00:00			`if 'labelAnnotations' in self.response:`
			`for label in self.response['labelAnnotations']:`
			`tags.append(label['description'])`

			`if 'webDetection' in self.response:`
			`if 'webEntities' in self.response['webDetection']:`
			`for label in self.response['webDetection']['webEntities']:`
			`tags.append(label['description'])`
			`return tags`

			`@property`
			`def landmark(self):`
			`landmark = None`
			`if 'landmarkAnnotations' in self.response:`
			`if len(self.response['landmarkAnnotations']):`
			`match = self.response['landmarkAnnotations'].pop()`
			`landmark = {`
			`'name': match['description'],`
			`'latitude': match['locations'][0]['latLng']['latitude'],`
			`'longitude': match['locations'][0]['latLng']['longitude']`
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00			`}`
- now checking images against google vision api (why google, despite my despise of google, the company): vision api is the only one which is simple enough to use and their labelling is reasonable. - checking text against google natural language api: the strict classification it offers is better, than free folksonomy, if I ever want to connect entries based on topic unfortunately they don't support Hungarian yet. 2018-12-11 14:06:18 +00:00			`return landmark`

			`@property`
			`def onlinecopies(self):`
			`copies = []`
			`if 'webDetection' in self.response:`
			`if 'pagesWithMatchingImages' in self.response['webDetection']:`
			`for match in self.response['webDetection']['pagesWithMatchingImages']:`
			`copies.append(match['url'])`
			`return copies`

			`def _call_tool(self):`
			`params = {`
			`"requests": [{`
			`"image": {"source": {"imageUri": self.imgurl}},`
			`"features": [`
			`{`
			`"type": "LANDMARK_DETECTION",`
			`},`
			`{`
			`"type": "WEB_DETECTION",`
			`},`
			`{`
			`"type": "LABEL_DETECTION",`
			`}`
			`]`
			`}]`
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00			`}`

- now checking images against google vision api (why google, despite my despise of google, the company): vision api is the only one which is simple enough to use and their labelling is reasonable. - checking text against google natural language api: the strict classification it offers is better, than free folksonomy, if I ever want to connect entries based on topic unfortunately they don't support Hungarian yet. 2018-12-11 14:06:18 +00:00			`url = "https://vision.googleapis.com/v1/images:annotate?key=%s" % (`
			`keys.gcloud.get('key')`
			`)`
re-adding immediate async processing; is_page added to hide a few footer metadata in page type posts; unused symbols removed, but assets updated; footer extended with a lot of extra information; kcl action replaced; 2018-12-27 19:48:06 +00:00			`logging.info(`
			`"calling Google Vision for %s",`
			`self.fpath`
			`)`
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00			`r = requests.post(url, json=params)`
			`try:`
			`resp = r.json()`
			`for k, v in resp.items():`
			`self[k] = v`
			`except Exception as e:`
- now checking images against google vision api (why google, despite my despise of google, the company): vision api is the only one which is simple enough to use and their labelling is reasonable. - checking text against google natural language api: the strict classification it offers is better, than free folksonomy, if I ever want to connect entries based on topic unfortunately they don't support Hungarian yet. 2018-12-11 14:06:18 +00:00			`logging.error(`
			`'failed to call Google Vision API on: %s, reason: %s',`
			`self.fpath,`
			`e`
			`)`
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00
			`class Exif(CachedMeta):`
			`def __init__(self, fpath):`
			`self.fpath = fpath`
			`self._read()`
v4.0a 2018-07-20 16:45:42 +01:00
- moved Tips from footer to bottom of each singular with an "encourage creation" header - google vision api binding for images (not in use yet) - konami code css preparations - experimental gallery css - minor code cleanups 2018-12-01 10:43:13 +00:00			`def _call_tool(self):`
v4.0a 2018-07-20 16:45:42 +01:00			`"""`
			`Why like this: the # on some of the params forces exiftool to`
			`display values like decimals, so the latitude / longitude params`
			`can be used and parsed in a sane way`

			`If only -json is passed, it gets everything nicely, but in the default`
			`format, which would require another round to parse`

			`"""`
			`cmd = (`
			`"exiftool",`
			`'-sort',`
			`'-json',`
			`'-MIMEType',`
			`'-FileType',`
			`'-FileName',`
			`'-FileSize#',`
			`'-ModifyDate',`
			`'-CreateDate',`
			`'-DateTimeOriginal',`
			`'-ImageHeight',`
			`'-ImageWidth',`
			`'-Aperture',`
			`'-FOV',`
			`'-ISO',`
			`'-FocalLength',`
			`'-FNumber',`
			`'-FocalLengthIn35mmFormat',`
			`'-ExposureTime',`
			`'-Model',`
			`'-GPSLongitude#',`
			`'-GPSLatitude#',`
			`'-LensID',`
			`'-LensSpec',`
			`'-Lens',`
			`'-ReleaseDate',`
			`'-Description',`
			`'-Headline',`
			`'-HierarchicalSubject',`
			`'-Copyright',`
			`'-Artist',`
			`self.fpath`
			`)`

			`p = subprocess.Popen(`
			`cmd,`
			`stdin=subprocess.PIPE,`
			`stdout=subprocess.PIPE,`
			`stderr=subprocess.PIPE,`
			`)`

			`stdout, stderr = p.communicate()`
			`if stderr:`
			`raise OSError("Error reading EXIF:\n\t%s\n\t%s", cmd, stderr)`

			`exif = json.loads(stdout.decode('utf-8').strip()).pop()`
			`if 'ReleaseDate' in exif and 'ReleaseTime' in exif:`
			`exif['DateTimeRelease'] = "%s %s" % (`
			`exif.get('ReleaseDate'), exif.get('ReleaseTime')[:8]`
			`)`
			`del(exif['ReleaseDate'])`
			`del(exif['ReleaseTime'])`

			`for k, v in exif.items():`
			`self[k] = self.exifdate2rfc(v)`

			`def exifdate2rfc(self, value):`
			`""" converts and EXIF date string to RFC 3339 format`

			`:param value: EXIF date (2016:05:01 00:08:24)`
			`:type arg1: str`
			`:return: RFC 3339 string with UTC timezone 2016-05-01T00:08:24+00:00`
			`:rtype: str`
			`"""`
			`if not isinstance(value, str):`
			`return value`
			`match = EXIFDATE.match(value)`
			`if not match:`
			`return value`
			`return "%s-%s-%sT%s+00:00" % (`
			`match.group('year'),`
			`match.group('month'),`
			`match.group('day'),`
			`match.group('time')`
			`)`