better indexer which only updates a document if its mtime has changed

This commit is contained in:
Peter Molnar 2017-05-26 14:52:30 +01:00
parent 558195288d
commit c6bae2837b
3 changed files with 110 additions and 29 deletions
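
The gist of the change, as a standalone sketch: before re-adding a document, look its permalink up in the existing index and compare the stored mtime against the source file's current one. This is illustrative only; needs_reindex, url, and docpath are made-up names, while the whoosh calls (QueryParser, searcher().search()) are the real API the diff below uses.

import os
from whoosh import qparser

def needs_reindex(ix, url, docpath):
    # look the document up by its URL; if it is already indexed and the
    # stored mtime matches the file on disk, there is nothing to do
    qp = qparser.QueryParser("url", schema=ix.schema)
    with ix.searcher() as searcher:
        results = searcher.search(qp.parse(url), limit=1)
        if not results:
            return True  # never indexed before
        return int(results[0]['mtime']) != int(os.path.getmtime(docpath))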

nasg.py

@@ -26,6 +26,7 @@ import langdetect
 import requests
 from breadability.readable import Article
 from whoosh import index
+from whoosh import qparser
 import jinja2
 import urllib.parse
 import shared
@@ -38,27 +39,94 @@ def splitpath(path):
         (path,tail) = os.path.split(path)
     return parts
-class Indexer(object):
+#class Indexer(object):
+    #def __init__(self):
+        #self.tmp = tempfile.mkdtemp(
+            #'whooshdb_',
+            #dir=tempfile.gettempdir()
+        #)
+        #atexit.register(
+            #shutil.rmtree,
+            #os.path.abspath(self.tmp)
+        #)
+        #self.ix = index.create_in(self.tmp, shared.schema)
+        #self.target = os.path.abspath(os.path.join(
+            #shared.config.get('target', 'builddir'),
+            #shared.config.get('var', 'searchdb')
+        #))
+        #self.writer = self.ix.writer()
+    #async def append(self, singular):
+        #logging.info("appending search index with %s", singular.fname)
+        #content_real = [
+            #singular.fname,
+            #singular.summary,
+            #singular.content,
+        #]
+        #content_remote = []
+        #for url, offlinecopy in singular.offlinecopies.items():
+            #content_remote.append("%s" % offlinecopy)
+        #weight = 1
+        #if singular.isbookmark:
+            #weight = 10
+        #if singular.ispage:
+            #weight = 100
+        #self.writer.add_document(
+            #title=singular.title,
+            #url=singular.url,
+            #content=" ".join(list(map(str,[*content_real, *content_remote]))),
+            #date=singular.published.datetime,
+            #tags=",".join(list(map(str, singular.tags))),
+            #weight=weight,
+            #img="%s" % singular.photo
+        #)
+    #def finish(self):
+        #self.writer.commit()
+        #if os.path.isdir(self.target):
+            #shutil.rmtree(self.target)
+        #shutil.copytree(self.tmp, self.target)
+class SmartIndexer(object):
     def __init__(self):
-        self.tmp = tempfile.mkdtemp(
-            'whooshdb_',
-            dir=tempfile.gettempdir()
-        )
-        atexit.register(
-            shutil.rmtree,
-            os.path.abspath(self.tmp)
-        )
-        self.ix = index.create_in(self.tmp, shared.schema)
         self.target = os.path.abspath(os.path.join(
             shared.config.get('target', 'builddir'),
             shared.config.get('var', 'searchdb')
         ))
-        self.writer = self.ix.writer()
+        if not os.path.isdir(self.target):
+            os.mkdir(self.target)
+        if index.exists_in(self.target):
+            self.ix = index.open_dir(self.target)
+        else:
+            self.ix = index.create_in(self.target, shared.schema)
+        self.writer = self.ix.writer()
+        self.qp = qparser.QueryParser("url", schema=shared.schema)
     async def append(self, singular):
-        logging.info("appending search index with %s", singular.fname)
+        logging.debug("searching for existing index for %s", singular.fname)
+        exists = False
+        q = self.qp.parse(singular.url)
+        r = self.ix.searcher().search(q, limit=1)
+        if r:
+            r = r[0]
+            # nothing to do, the entry is present and is up to date
+            ixtime = r['mtime']
+            if int(ixtime) == int(singular.mtime):
+                logging.info("search index is up to date for %s", singular.fname)
+                return
+            else:
+                logging.info("search index is out of date: %d (indexed) vs %d", ixtime, singular.mtime)
+                exists = True
         content_real = [
             singular.fname,
@@ -76,21 +144,33 @@ class Indexer(object):
         if singular.ispage:
             weight = 100
-        self.writer.add_document(
-            title=singular.title,
-            url=singular.url,
-            content=" ".join(list(map(str,[*content_real, *content_remote]))),
-            date=singular.published.datetime,
-            tags=",".join(list(map(str, singular.tags))),
-            weight=weight,
-            img="%s" % singular.photo
-        )
+        if exists:
+            logging.info("updating search index with %s", singular.fname)
+            self.writer.update_document(
+                title=singular.title,
+                url=singular.url,
+                content=" ".join(list(map(str,[*content_real, *content_remote]))),
+                date=singular.published.datetime,
+                tags=",".join(list(map(str, singular.tags))),
+                weight=weight,
+                img="%s" % singular.photo,
+                mtime=singular.mtime,
+            )
+        else:
+            logging.info("appending search index with %s", singular.fname)
+            self.writer.add_document(
+                title=singular.title,
+                url=singular.url,
+                content=" ".join(list(map(str,[*content_real, *content_remote]))),
+                date=singular.published.datetime,
+                tags=",".join(list(map(str, singular.tags))),
+                weight=weight,
+                img="%s" % singular.photo,
+                mtime=singular.mtime
+            )
     def finish(self):
         self.writer.commit()
-        if os.path.isdir(self.target):
-            shutil.rmtree(self.target)
-        shutil.copytree(self.tmp, self.target)
 class OfflineCopy(object):
     def __init__(self, url):
@@ -160,7 +240,7 @@ class OfflineCopy(object):
         doc = Article(r.text, url=self.url)
         self.fm.metadata['title'] = doc._original_document.title
         self.fm.metadata['realurl'] = r.url
-        self.fm.content = Pandoc(False).convert(doc.readable)
+        self.fm.content = shared.Pandoc(False).convert(doc.readable)
         self.write()
@@ -1046,7 +1126,7 @@ class Singular(object):
     @property
     def html(self):
-        return Pandoc().convert(self.content)
+        return shared.Pandoc().convert(self.content)
     @property
     def offlinecopies(self):
@@ -1330,7 +1410,7 @@ class NASG(object):
             shutil.copy2(s, d)
         logging.info("populating searchdb")
-        searchdb = Indexer()
+        searchdb = SmartIndexer()
         loop.run_until_complete(self.__aindex(content, searchdb))
         searchdb.finish()
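
A whoosh detail worth noting about the if exists: branch in the append() hunk above: update_document() only replaces an existing entry if at least one field in the schema is declared unique; otherwise it degenerates into a plain add and the index accumulates duplicates. A minimal sketch under that assumption (this schema is hypothetical, not the project's; the real url field definition is not part of this diff):

import tempfile
from whoosh import fields, index

schema = fields.Schema(
    url=fields.ID(stored=True, unique=True),  # unique key update_document matches on
    content=fields.TEXT,
    mtime=fields.NUMERIC(stored=True),
)
ix = index.create_in(tempfile.mkdtemp('whooshdb_'), schema)

with ix.writer() as w:
    w.add_document(url="https://example.net/post/", content="v1", mtime=1)
with ix.writer() as w:
    # same url: the old entry is deleted, then the new one is added
    w.update_document(url="https://example.net/post/", content="v2", mtime=2)

with ix.searcher() as s:
    assert s.doc_count() == 1  # still a single entry for that url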


@@ -9,8 +9,6 @@ import sanic.response
 from sanic.log import log as logging
 from whoosh import index
 from whoosh import qparser
-from whoosh import fields
-from whoosh import analysis
 import jinja2
 import shared

@@ -70,6 +70,9 @@ schema = fields.Schema(
     ),
     img=fields.TEXT(
         stored=True
+    ),
+    mtime=fields.NUMERIC(
+        stored=True
     )
 )
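
For completeness, a sketch of how the new NUMERIC mtime field round-trips (the directory, file, and URL here are made up, and the schema is again a reduced stand-in): store the integer modification time of the source file, then read it back from the hit's stored fields on the next run.

import os
import tempfile
from whoosh import fields, index
from whoosh.query import Term

schema = fields.Schema(
    url=fields.ID(stored=True, unique=True),
    mtime=fields.NUMERIC(stored=True),
)
tmpdir = tempfile.mkdtemp('whooshdb_')
src = os.path.join(tmpdir, 'post.md')  # stand-in for a post's source file
with open(src, 'w') as f:
    f.write('hello')

ix = index.create_in(tmpdir, schema)
with ix.writer() as w:
    w.add_document(url='https://example.net/x/', mtime=int(os.path.getmtime(src)))

with ix.searcher() as s:
    hit = s.search(Term('url', 'https://example.net/x/'), limit=1)[0]
    print(int(hit['mtime']))  # compare against os.path.getmtime() on the next run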