petermolnar's repositories — nasg: cfe0112f70dd815a76e9121fbd99feff7ad92e12

better search and webmention sending added

Peter Molnar hello@petermolnar.eu

Wed, 31 May 2017 13:53:47 +0100

commit

cfe0112f70dd815a76e9121fbd99feff7ad92e12

parent

c6bae2837bcf1208af1032658ff67b9d671d0b04

3 files changed, 146 insertions(+), 180 deletions(-)

jump to

nasg.py

shared.py

webmention.py

M nasg.py → nasg.py

@@ -15,6 +15,7 @@ import hashlib
 import math
 import asyncio
 import csv
+import operator
 
 import magic
 import arrow
@@ -30,6 +31,9 @@ from whoosh import qparser
 import jinja2
 import urllib.parse
 import shared
+from webmentiontools.send import WebmentionSend
+
+import time
 
 def splitpath(path):
     parts = []
@@ -39,60 +43,6 @@ parts.insert(0,tail)
         (path,tail) = os.path.split(path)
     return parts
 
-#class Indexer(object):
-
-    #def __init__(self):
-        #self.tmp = tempfile.mkdtemp(
-            #'whooshdb_',
-            #dir=tempfile.gettempdir()
-        #)
-        #atexit.register(
-            #shutil.rmtree,
-            #os.path.abspath(self.tmp)
-        #)
-        #self.ix = index.create_in(self.tmp, shared.schema)
-        #self.target = os.path.abspath(os.path.join(
-            #shared.config.get('target', 'builddir'),
-            #shared.config.get('var', 'searchdb')
-        #))
-        #self.writer = self.ix.writer()
-
-
-    #async def append(self, singular):
-        #logging.info("appending search index with %s", singular.fname)
-
-        #content_real = [
-            #singular.fname,
-            #singular.summary,
-            #singular.content,
-        #]
-
-        #content_remote = []
-        #for url, offlinecopy in singular.offlinecopies.items():
-            #content_remote.append("%s" % offlinecopy)
-
-        #weight = 1
-        #if singular.isbookmark:
-            #weight = 10
-        #if singular.ispage:
-            #weight = 100
-
-        #self.writer.add_document(
-            #title=singular.title,
-            #url=singular.url,
-            #content=" ".join(list(map(str,[*content_real, *content_remote]))),
-            #date=singular.published.datetime,
-            #tags=",".join(list(map(str, singular.tags))),
-            #weight=weight,
-            #img="%s" % singular.photo
-        #)
-
-    #def finish(self):
-        #self.writer.commit()
-        #if os.path.isdir(self.target):
-            #shutil.rmtree(self.target)
-        #shutil.copytree(self.tmp, self.target)
-
 
 class SmartIndexer(object):
 
@@ -175,8 +125,7 @@
 class OfflineCopy(object):
     def __init__(self, url):
         self.url = url
-        h = url.encode('utf-8')
-        self.fname = hashlib.sha1(h).hexdigest()
+        self.fname = hashlib.sha1(url.encode('utf-8')).hexdigest()
         self.targetdir = os.path.abspath(
             shared.config.get('source', 'offlinecopiesdir')
         )
@@ -283,7 +232,14 @@ self.data = {}
 
     def append(self, key, value):
         if key in self.data:
-            logging.error("duplicate key: %s", key)
+            logging.warning("duplicate key: %s, using existing instead", key)
+            existing = self.data.get(key)
+            if hasattr(value, 'fname') and hasattr(existing, 'fname'):
+                logging.warning(
+                    "%s collides with existing %s",
+                    value.fname,
+                    existing.fname
+                )
             return
         self.data[key] = value
 
@@ -305,95 +261,6 @@ for k, v in self.data.items():
             yield (k, v)
         return
 
-#class CMDLine(object):
-    #def __init__(self, executable):
-        #self.executable = self._which(executable)
-        #if self.executable is None:
-            #raise OSError('No %s found in PATH!' % executable)
-            #return
-
-    #@staticmethod
-    #def _which(name):
-        #for d in os.environ['PATH'].split(':'):
-            #which = glob.glob(os.path.join(d, name), recursive=True)
-            #if which:
-                #return which.pop()
-        #return None
-
-    #def __enter__(self):
-        #self.process = subprocess.Popen(
-            #[self.executable, "-stay_open", "True",  "-@", "-"],
-            #universal_newlines=True,
-            #stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-        #return self
-
-    #def  __exit__(self, exc_type, exc_value, traceback):
-        #self.process.stdin.write("-stay_open\nFalse\n")
-        #self.process.stdin.flush()
-
-    #def execute(self, *args):
-        #args = args + ("-execute\n",)
-        #self.process.stdin.write(str.join("\n", args))
-        #self.process.stdin.flush()
-        #output = ""
-        #fd = self.process.stdout.fileno()
-        #while not output.endswith(self.sentinel):
-            #output += os.read(fd, 4096).decode('utf-8', errors='ignore')
-        #return output[:-len(self.sentinel)]
-
-
-#class Pandoc(CMDLine):
-    #""" Handles calling external binary `exiftool` in an efficient way """
-    #def __init__(self, md2html=True):
-        #super().__init__('pandoc')
-        #if md2html:
-            #self.i = "markdown+" + "+".join([
-                #'backtick_code_blocks',
-                #'auto_identifiers',
-                #'fenced_code_attributes',
-                #'definition_lists',
-                #'grid_tables',
-                #'pipe_tables',
-                #'strikeout',
-                #'superscript',
-                #'subscript',
-                #'markdown_in_html_blocks',
-                #'shortcut_reference_links',
-                #'autolink_bare_uris',
-                #'raw_html',
-                #'link_attributes',
-                #'header_attributes',
-                #'footnotes',
-            #])
-            #self.o = 'html5'
-        #else:
-            #self.o = "markdown-" + "-".join([
-                #'raw_html',
-                #'native_divs',
-                #'native_spans',
-            #])
-            #self.i = 'html'
-
-    #def convert(self, text):
-        #cmd = (
-            #self.executable,
-            #'-o-',
-            #'--from=%s' % self.i,
-            #'--to=%s' % self.o
-        #)
-        #logging.debug('converting content with Pandoc')
-        #p = subprocess.Popen(
-            #cmd,
-            #stdin=subprocess.PIPE,
-            #stdout=subprocess.PIPE,
-            #stderr=subprocess.PIPE,
-        #)
-
-        #stdout, stderr = p.communicate(input=text.encode())
-        #if stderr:
-            #logging.error("Error during pandoc covert:\n\t%s\n\t%s", cmd, stderr)
-        #return stdout.decode('utf-8').strip()
-
 # based on http://stackoverflow.com/a/10075210
 class ExifTool(shared.CMDLine):
     """ Handles calling external binary `exiftool` in an efficient way """
@@ -403,7 +270,32 @@ def __init__(self):
         super().__init__('exiftool')
 
     def get_metadata(self, *filenames):
-        return json.loads(self.execute('-sort', '-json', '-MIMEType', '-FileType', '-FileName', '-ModifyDate', '-CreateDate', '-DateTimeOriginal', '-ImageHeight', '-ImageWidth', '-Aperture', '-FOV', '-ISO', '-FocalLength', '-FNumber', '-FocalLengthIn35mmFormat', '-ExposureTime', '-Copyright', '-Artist', '-Model', '-GPSLongitude#', '-GPSLatitude#', '-LensID', *filenames))
+        return json.loads(self.execute(
+            '-sort',
+            #'-quiet',
+            '-json',
+            '-MIMEType',
+            '-FileType',
+            '-FileName',
+            '-ModifyDate',
+            '-CreateDate',
+            '-DateTimeOriginal',
+            '-ImageHeight',
+            '-ImageWidth',
+            '-Aperture',
+            '-FOV',
+            '-ISO',
+            '-FocalLength',
+            '-FNumber',
+            '-FocalLengthIn35mmFormat',
+            '-ExposureTime',
+            '-Copyright',
+            '-Artist',
+            '-Model',
+            '-GPSLongitude#',
+            '-GPSLatitude#',
+            '-LensID',
+            *filenames))
 
 class Images(BaseIter):
     def __init__(self, extensions=['jpg', 'gif', 'png']):
@@ -477,6 +369,7 @@ self.alttext = ''
         self.sizes = []
         self.fallbacksize = int(shared.config.get('common','fallbackimg', fallback='720'))
         self.cl = None
+        self.singleimage = False
 
         for size in shared.config.options('downsize'):
             sizeext = shared.config.get('downsize', size)
@@ -512,7 +405,11 @@ )
 
     def __str__(self):
         if self.is_downsizeable and not self.cl:
-            return '\n<figure class="photo"><a target="_blank" class="adaptive" href="%s"><img src="%s" class="adaptimg" alt="%s" /></a><figcaption class=\"caption\">%s%s</figcaption></figure>\n' % (
+            uphoto = ''
+            if self.singleimage:
+                uphoto = ' u-photo'
+            return '\n<figure class="photo"><a target="_blank" class="adaptive%s" href="%s"><img src="%s" class="adaptimg" alt="%s" /></a><figcaption class=\"caption\">%s%s</figcaption></figure>\n' % (
+                uphoto,
                 self.target,
                 self.fallback,
                 self.alttext,
@@ -913,6 +810,7 @@ for row in r:
                     gones.append(row[0])
 
         tmplvars = {
+            'site': renderer.sitevars,
             'redirects': redirects,
             'gones': gones
         }
@@ -939,6 +837,8 @@ self.fname, self.ext = os.path.splitext(os.path.basename(self.path))
         self.meta = {}
         self.content = ''
         self.photo = self.images.data.get("%s.jpg" % self.fname, None)
+        if self.photo:
+            self.photo.singleimage = True
         self.__parse()
 
     def __repr__(self):
@@ -949,7 +849,6 @@ with open(self.path, mode='rt') as f:
             self.meta, self.content = frontmatter.parse(f.read())
             self.__filter_images()
         if self.isphoto:
-            #self.photo.alttext = self.content
             self.content = "%s\n%s" % (
                 self.content,
                 self.photo
@@ -1033,7 +932,7 @@ for link in urls:
             domain = '{uri.netloc}'.format(uri=urllib.parse.urlparse(link))
             if domain in shared.config.get('site', 'domains'):
                 continue
-            if r.get(link, False):
+            if link in r:
                 continue
             r.append(link)
 
@@ -1275,6 +1174,45 @@ html.write(r)
             html.close()
             os.utime(target, (self.mtime, self.mtime))
 
+
+class Webmentioner(object):
+    """ Sends webmentions for the links of an entry and keeps a JSON log of
+    what was already sent, so the same source/target/mtime is not pinged twice """
+
+    def __init__(self):
+        # the log path is built from the 'target'/'builddir' and the
+        # 'var'/'webmentions' config values
+        self.dbpath = os.path.abspath(os.path.join(
+            shared.config.get('target', 'builddir'),
+            shared.config.get('var', 'webmentions')
+        ))
+
+        # continue from the log of an earlier run when the file exists
+        if os.path.isfile(self.dbpath):
+            with open(self.dbpath, 'rt') as f:
+                self.db = json.loads(f.read())
+        else:
+            self.db = {}
+
+    async def ping(self, singular, dry_run = False):
+        """ Send a webmention from `singular.url` to every url in
+        `singular.urls` that is not in the log yet; with `dry_run` nothing is
+        sent, but the record is still added to the log """
+        for target in singular.urls:
+            record = {
+                'mtime': singular.mtime,
+                'source': singular.url,
+                'target': target
+            }
+            # mtime is part of the hashed record, so an entry with a new
+            # mtime gets a new key and is pinged again
+            h = json.dumps(record, sort_keys=True)
+            h = hashlib.sha1(h.encode('utf-8')).hexdigest()
+            if h in self.db:
+                logging.debug("%s is already pinged from %s @ %d, skipping",
+                    target, singular.url, singular.mtime)
+                continue
+
+            logging.info("sending webmention from %s to %s", singular, target)
+            if not dry_run:
+                # the source of the mention is the url of the entry itself,
+                # the same value that is stored as record['source']
+                ws = WebmentionSend(singular.url, target)
+                # NOTE(review): confirm that WebmentionSend.send() returns an
+                # awaitable and accepts `allowredirect` - the webmentiontools
+                # release I know of is synchronous and forwards its keyword
+                # arguments to requests
+                await ws.send(allowredirect=True, timeout=30)
+            self.db[h] = record
+
+    def finish(self):
+        """ Write the log of sent webmentions to `self.dbpath` as sorted,
+        indented JSON """
+        with open(self.dbpath, 'wt') as f:
+            f.write(json.dumps(self.db, sort_keys=True, indent=4))
+
+
 class NASG(object):
     def __init__(self):
         # --- set params
@@ -1299,7 +1237,7 @@ help='force rendering HTML'
         )
         parser.add_argument(
             '--loglevel',
-            default='info',
+            default='error',
             help='change loglevel'
         )
         parser.add_argument(
@@ -1358,6 +1296,10 @@ async def __aindex(self, content, searchdb):
         for (pubtime, singular) in content:
             await searchdb.append(singular)
 
+    async def __aping(self, content, pinger):
+        """ Hand every entry of `content` to `pinger.ping`, one after the other """
+        for (_published, entry) in content:
+            await pinger.ping(entry)
+
     def run(self):
 
         if shared.config.getboolean('params', 'clear'):
@@ -1375,7 +1317,10 @@
         logging.info("discovering images")
         images = Images()
         images.populate()
-        existing = glob.glob(os.path.join(shared.config.get('target', 'filesdir'), "*"))
+        existing = glob.glob(os.path.join(
+            shared.config.get('target', 'filesdir'),
+            "*"
+        ))
         if not shared.config.getboolean('params', 'nodownsize'):
             logging.info("downsizing images")
             loop.run_until_complete(self.__adownsize(images, existing))
@@ -1413,6 +1358,11 @@ logging.info("pouplating searchdb")
         searchdb = SmartIndexer()
         loop.run_until_complete(self.__aindex(content, searchdb))
         searchdb.finish()
+
+        logging.info("webmentioning urls")
+        pinger = Webmentioner()
+        loop.run_until_complete(self.__aping(content, pinger))
+        pinger.finish()
 
         loop.close()

M shared.py → shared.py

@@ -22,6 +22,7 @@ ))
     return config
 
 ARROWISO = 'YYYY-MM-DDTHH:mm:ssZ'
+STRFISO = '%Y-%m-%dT%H:%M:%S%z'
 
 URLREGEX = re.compile(
     r'\s+https?\:\/\/?[a-zA-Z0-9\.\/\?\:@\-_=#]+'
@@ -46,8 +47,7 @@ unique=True
     ),
     title=fields.TEXT(
         stored=True,
-        analyzer=analysis.FancyAnalyzer(
-        )
+        analyzer=analysis.FancyAnalyzer()
     ),
     date=fields.DATETIME(
         stored=True,
@@ -55,8 +55,7 @@ sortable=True
     ),
     content=fields.TEXT(
         stored=True,
-        analyzer=analysis.FancyAnalyzer(
-        )
+        analyzer=analysis.FancyAnalyzer()
     ),
     tags=fields.TEXT(
         stored=True,
@@ -120,7 +119,7 @@ output += os.read(fd, 4096).decode('utf-8', errors='ignore')
         return output[:-len(self.sentinel)]
 
 class Pandoc(CMDLine):
-    """ Handles calling external binary `exiftool` in an efficient way """
+    """ Pandoc command line call with piped in- and output """
     def __init__(self, md2html=True):
         super().__init__('pandoc')
         if md2html:
@@ -158,7 +157,7 @@ '-o-',
             '--from=%s' % self.i,
             '--to=%s' % self.o
         )
-        logging.debug('converting content with Pandoc')
+        logging.debug('converting string with Pandoc')
         p = subprocess.Popen(
             cmd,
             stdin=subprocess.PIPE,
@@ -168,5 +167,9 @@ )
 
         stdout, stderr = p.communicate(input=text.encode())
         if stderr:
-            logging.error("Error during pandoc covert:\n\t%s\n\t%s", cmd, stderr)
+            logging.error(
+                "Error during pandoc covert:\n\t%s\n\t%s",
+                cmd,
+                stderr
+            )
         return stdout.decode('utf-8').strip()

M webmention.py → webmention.py

@@ -32,8 +32,8 @@ if not self._validate():
             return
 
         self._parse()
-        self._save()
-        self._notify()
+        if self._save():
+            self._notify()
 
     def _validate(self):
         test = {
@@ -82,7 +82,6 @@ status=408
             )
             return False
 
-        self.source = self._source.realurl
         if not self._source.linksTo(self.target):
             self.r = sanic.response.text(
                 "'source' (%s) does not link to 'target' (%s)" % (
@@ -100,17 +99,34 @@ self.r = sanic.response.text(
                 "couldn't fetch 'target' from %s" % (self.target),
                 status=408
             )
-        self.target = self._target.realurl
         #logging.info("parsed webmention:\n%s\n\n%s", self.meta, self.content)
 
+    def _accepted(self):
+        """ Set the response to a plain text "accepted" with HTTP status 202 """
+        self.r = sanic.response.text("accepted", status=202)
+
+
     def _save(self):
-        doc = frontmatter.loads('')
-        doc.metadata = self.meta
-        doc.content = self.content
         target = os.path.join(
             shared.config.get('source', 'commentsdir'),
             self.mhash
         )
+
+        if os.path.isfile(target):
+            with open(target) as f:
+                doc = frontmatter.loads(f.read())
+        else:
+            doc = frontmatter.loads('')
+
+        if self.content == doc.content:
+            logging.warning('repinged target, no update needed')
+            self._accepted()
+            return False
+
+        doc.metadata = self.meta
+        doc.content = self.content
         if os.path.isfile(target):
             logging.warning('updating existing webmention %s', target)
         else:
@@ -118,18 +134,17 @@ logging.warning('saving incoming webmention to %s', target)
 
         with open(target, 'wt') as t:
             t.write(frontmatter.dumps(doc))
-            self.r = sanic.response.text(
-                "accepted",
-                status=202
-            )
+            self._accepted()
+            return True
 
     def _notify(self):
-        text = "# webmention\n## Source\n\nauthor\n:    %s\n\nURL\n:    %s\n\nemail\n:    %s\n\ndate\n:    %s\n\n## Target\n\nURL\n:    %s\n\n---\n\n%s" % (
+        text = "\nsource URL\n:    %s\n\ntarget URL:\n:    %s\n\ndate\n:    %s\n\nauthor name:\n:    %s\n\nauthor URL:\n:    %s\n\nauthor email:\n:    %s\n\n---\n\n%s" % (
+            self.source,
+            self.target,
+            self._meta['date'],
             self._meta['author'].get('name', self.source),
             self._meta['author'].get('url', self.source),
             self._meta['author'].get('email', ''),
-            self._meta['date'],
-            self.target,
             self.content
         )
 
@@ -158,12 +173,13 @@ if hasattr(self, '_meta'):
             return self._meta
 
         self._meta = {
-            'author': self._source.author(),
-            'type': self._source.relationType(),
+            'author': self._source.author,
+            'type': self._source.relationType,
             'target': self.target,
             'source': self.source,
-            'date': arrow.get(self._source.pubDate()).format(shared.ARROWISO),
+            'date': arrow.get(self._source.pubDate).format(shared.ARROWISO),
         }
+
         return self._meta
 
     @property
@@ -171,10 +187,7 @@ def content(self):
         if hasattr(self, '_content'):
             return self._content
 
-        # from HTML to Markdown
-        self._content = shared.Pandoc(False).convert(self._source.content())
-        # from Markdown back to HTML
-        #self._content = shared.Pandoc().convert(tmpcontent)
+        self._content = shared.Pandoc(False).convert(self._source.content)
         return self._content