petermolnar's repositories — nasg: c6bae2837bcf1208af1032658ff67b9d671d0b04

better indexer which only updates a document if its mtime has changed

Peter Molnar hello@petermolnar.net

Fri, 26 May 2017 14:52:30 +0100

commit

c6bae2837bcf1208af1032658ff67b9d671d0b04

parent

558195288d9d6c5af4bf116c64e6537b01a268c8

3 files changed, 110 insertions(+), 29 deletions(-)

jump to

nasg.py

search.py

shared.py

M nasg.py → nasg.py

@@ -26,6 +26,7 @@ import langdetect
 import requests
 from breadability.readable import Article
 from whoosh import index
+from whoosh import qparser
 import jinja2
 import urllib.parse
 import shared
@@ -38,27 +39,94 @@ parts.insert(0,tail)
         (path,tail) = os.path.split(path)
     return parts
 
-class Indexer(object):
+#class Indexer(object):
+
+    #def __init__(self):
+        #self.tmp = tempfile.mkdtemp(
+            #'whooshdb_',
+            #dir=tempfile.gettempdir()
+        #)
+        #atexit.register(
+            #shutil.rmtree,
+            #os.path.abspath(self.tmp)
+        #)
+        #self.ix = index.create_in(self.tmp, shared.schema)
+        #self.target = os.path.abspath(os.path.join(
+            #shared.config.get('target', 'builddir'),
+            #shared.config.get('var', 'searchdb')
+        #))
+        #self.writer = self.ix.writer()
+
+
+    #async def append(self, singular):
+        #logging.info("appending search index with %s", singular.fname)
+
+        #content_real = [
+            #singular.fname,
+            #singular.summary,
+            #singular.content,
+        #]
+
+        #content_remote = []
+        #for url, offlinecopy in singular.offlinecopies.items():
+            #content_remote.append("%s" % offlinecopy)
+
+        #weight = 1
+        #if singular.isbookmark:
+            #weight = 10
+        #if singular.ispage:
+            #weight = 100
+
+        #self.writer.add_document(
+            #title=singular.title,
+            #url=singular.url,
+            #content=" ".join(list(map(str,[*content_real, *content_remote]))),
+            #date=singular.published.datetime,
+            #tags=",".join(list(map(str, singular.tags))),
+            #weight=weight,
+            #img="%s" % singular.photo
+        #)
+
+    #def finish(self):
+        #self.writer.commit()
+        #if os.path.isdir(self.target):
+            #shutil.rmtree(self.target)
+        #shutil.copytree(self.tmp, self.target)
+
+
+class SmartIndexer(object):
 
     def __init__(self):
-        self.tmp = tempfile.mkdtemp(
-            'whooshdb_',
-            dir=tempfile.gettempdir()
-        )
-        atexit.register(
-            shutil.rmtree,
-            os.path.abspath(self.tmp)
-        )
-        self.ix = index.create_in(self.tmp, shared.schema)
         self.target = os.path.abspath(os.path.join(
             shared.config.get('target', 'builddir'),
             shared.config.get('var', 'searchdb')
         ))
+        if not os.path.isdir(self.target):
+            os.mkdir(self.target)
+
+        if index.exists_in(self.target):
+            self.ix = index.open_dir(self.target)
+        else:
+            self.ix = index.create_in(self.target, shared.schema)
         self.writer = self.ix.writer()
+        self.qp = qparser.QueryParser("url", schema=shared.schema)
 
+    async def append(self, singular):
+        logging.debug("searching for existing index for %s", singular.fname)
+        exists = False
 
-    async def append(self, singular):
-        logging.info("appending search index with %s", singular.fname)
+        q = self.qp.parse(singular.url)
+        r = self.ix.searcher().search(q, limit=1)
+        if r:
+            r = r[0]
+            # nothing to do, the entry is present and is up to date
+            ixtime = r['mtime']
+            if  int(ixtime) == int(singular.mtime):
+                logging.info("search index is up to date for %s", singular.fname)
+                return
+            else:
+                logging.info("search index is out of date: %d (indexed) vs %d", ixtime, singular.mtime)
+                exists = True
 
         content_real = [
             singular.fname,
@@ -76,21 +144,33 @@ weight = 10
         if singular.ispage:
             weight = 100
 
-        self.writer.add_document(
-            title=singular.title,
-            url=singular.url,
-            content=" ".join(list(map(str,[*content_real, *content_remote]))),
-            date=singular.published.datetime,
-            tags=",".join(list(map(str, singular.tags))),
-            weight=weight,
-            img="%s" % singular.photo
-        )
+        if exists:
+            logging.info("updating search index with %s", singular.fname)
+            self.writer.add_document(
+                title=singular.title,
+                url=singular.url,
+                content=" ".join(list(map(str,[*content_real, *content_remote]))),
+                date=singular.published.datetime,
+                tags=",".join(list(map(str, singular.tags))),
+                weight=weight,
+                img="%s" % singular.photo,
+                mtime=singular.mtime,
+            )
+        else:
+            logging.info("appending search index with %s", singular.fname)
+            self.writer.update_document(
+                title=singular.title,
+                url=singular.url,
+                content=" ".join(list(map(str,[*content_real, *content_remote]))),
+                date=singular.published.datetime,
+                tags=",".join(list(map(str, singular.tags))),
+                weight=weight,
+                img="%s" % singular.photo,
+                mtime=singular.mtime
+            )
 
     def finish(self):
         self.writer.commit()
-        if os.path.isdir(self.target):
-            shutil.rmtree(self.target)
-        shutil.copytree(self.tmp, self.target)
 
 class OfflineCopy(object):
     def __init__(self, url):
@@ -160,7 +240,7 @@
         doc = Article(r.text, url=self.url)
         self.fm.metadata['title'] = doc._original_document.title
         self.fm.metadata['realurl'] = r.url
-        self.fm.content = Pandoc(False).convert(doc.readable)
+        self.fm.content = shared.Pandoc(False).convert(doc.readable)
         self.write()
 
 
@@ -1046,7 +1126,7 @@ return 'singular.html'
 
     @property
     def html(self):
-        return Pandoc().convert(self.content)
+        return shared.Pandoc().convert(self.content)
 
     @property
     def offlinecopies(self):
@@ -1330,7 +1410,7 @@ logging.debug("copying %s to %s", s, d)
             shutil.copy2(s, d)
 
         logging.info("pouplating searchdb")
-        searchdb = Indexer()
+        searchdb = SmartIndexer()
         loop.run_until_complete(self.__aindex(content, searchdb))
         searchdb.finish()

M search.py → search.py

@@ -9,8 +9,6 @@ import sanic.response
 from sanic.log import log as logging
 from whoosh import index
 from whoosh import qparser
-from whoosh import fields
-from whoosh import analysis
 import jinja2
 import shared

M shared.py → shared.py

@@ -70,6 +70,9 @@ sortable=True
     ),
     img=fields.TEXT(
         stored=True
+    ),
+    mtime=fields.NUMERIC(
+        stored=True
     )
 )