all repos — nasg @ c6bae2837bcf1208af1032658ff67b9d671d0b04

better indexer which only updates a document if its mtime has changed
Peter Molnar hello@petermolnar.net
Fri, 26 May 2017 14:52:30 +0100
commit

c6bae2837bcf1208af1032658ff67b9d671d0b04

parent

558195288d9d6c5af4bf116c64e6537b01a268c8

3 files changed, 110 insertions(+), 29 deletions(-)

jump to
M nasg.py

@@ -26,6 +26,7 @@ import langdetect

import requests from breadability.readable import Article from whoosh import index +from whoosh import qparser import jinja2 import urllib.parse import shared

@@ -38,27 +39,94 @@ parts.insert(0,tail)

(path,tail) = os.path.split(path) return parts -class Indexer(object): +#class Indexer(object): + + #def __init__(self): + #self.tmp = tempfile.mkdtemp( + #'whooshdb_', + #dir=tempfile.gettempdir() + #) + #atexit.register( + #shutil.rmtree, + #os.path.abspath(self.tmp) + #) + #self.ix = index.create_in(self.tmp, shared.schema) + #self.target = os.path.abspath(os.path.join( + #shared.config.get('target', 'builddir'), + #shared.config.get('var', 'searchdb') + #)) + #self.writer = self.ix.writer() + + + #async def append(self, singular): + #logging.info("appending search index with %s", singular.fname) + + #content_real = [ + #singular.fname, + #singular.summary, + #singular.content, + #] + + #content_remote = [] + #for url, offlinecopy in singular.offlinecopies.items(): + #content_remote.append("%s" % offlinecopy) + + #weight = 1 + #if singular.isbookmark: + #weight = 10 + #if singular.ispage: + #weight = 100 + + #self.writer.add_document( + #title=singular.title, + #url=singular.url, + #content=" ".join(list(map(str,[*content_real, *content_remote]))), + #date=singular.published.datetime, + #tags=",".join(list(map(str, singular.tags))), + #weight=weight, + #img="%s" % singular.photo + #) + + #def finish(self): + #self.writer.commit() + #if os.path.isdir(self.target): + #shutil.rmtree(self.target) + #shutil.copytree(self.tmp, self.target) + + +class SmartIndexer(object): def __init__(self): - self.tmp = tempfile.mkdtemp( - 'whooshdb_', - dir=tempfile.gettempdir() - ) - atexit.register( - shutil.rmtree, - os.path.abspath(self.tmp) - ) - self.ix = index.create_in(self.tmp, shared.schema) self.target = os.path.abspath(os.path.join( shared.config.get('target', 'builddir'), shared.config.get('var', 'searchdb') )) + if not os.path.isdir(self.target): + os.mkdir(self.target) + + if index.exists_in(self.target): + self.ix = index.open_dir(self.target) + else: + self.ix = index.create_in(self.target, shared.schema) self.writer = self.ix.writer() + self.qp = 
qparser.QueryParser("url", schema=shared.schema) + async def append(self, singular): + logging.debug("searching for existing index for %s", singular.fname) + exists = False - async def append(self, singular): - logging.info("appending search index with %s", singular.fname) + q = self.qp.parse(singular.url) + r = self.ix.searcher().search(q, limit=1) + if r: + r = r[0] + # nothing to do, the entry is present and is up to date + ixtime = r['mtime'] + if int(ixtime) == int(singular.mtime): + logging.info("search index is up to date for %s", singular.fname) + return + else: + logging.info("search index is out of date: %d (indexed) vs %d", ixtime, singular.mtime) + exists = True content_real = [ singular.fname,

@@ -76,21 +144,33 @@ weight = 10

if singular.ispage: weight = 100 - self.writer.add_document( - title=singular.title, - url=singular.url, - content=" ".join(list(map(str,[*content_real, *content_remote]))), - date=singular.published.datetime, - tags=",".join(list(map(str, singular.tags))), - weight=weight, - img="%s" % singular.photo - ) + if exists: + logging.info("updating search index with %s", singular.fname) + self.writer.add_document( + title=singular.title, + url=singular.url, + content=" ".join(list(map(str,[*content_real, *content_remote]))), + date=singular.published.datetime, + tags=",".join(list(map(str, singular.tags))), + weight=weight, + img="%s" % singular.photo, + mtime=singular.mtime, + ) + else: + logging.info("appending search index with %s", singular.fname) + self.writer.update_document( + title=singular.title, + url=singular.url, + content=" ".join(list(map(str,[*content_real, *content_remote]))), + date=singular.published.datetime, + tags=",".join(list(map(str, singular.tags))), + weight=weight, + img="%s" % singular.photo, + mtime=singular.mtime + ) def finish(self): self.writer.commit() - if os.path.isdir(self.target): - shutil.rmtree(self.target) - shutil.copytree(self.tmp, self.target) class OfflineCopy(object): def __init__(self, url):

@@ -160,7 +240,7 @@

doc = Article(r.text, url=self.url) self.fm.metadata['title'] = doc._original_document.title self.fm.metadata['realurl'] = r.url - self.fm.content = Pandoc(False).convert(doc.readable) + self.fm.content = shared.Pandoc(False).convert(doc.readable) self.write()

@@ -1046,7 +1126,7 @@ return 'singular.html'

@property def html(self): - return Pandoc().convert(self.content) + return shared.Pandoc().convert(self.content) @property def offlinecopies(self):

@@ -1330,7 +1410,7 @@ logging.debug("copying %s to %s", s, d)

shutil.copy2(s, d) logging.info("pouplating searchdb") - searchdb = Indexer() + searchdb = SmartIndexer() loop.run_until_complete(self.__aindex(content, searchdb)) searchdb.finish()
M search.py

@@ -9,8 +9,6 @@ import sanic.response

from sanic.log import log as logging from whoosh import index from whoosh import qparser -from whoosh import fields -from whoosh import analysis import jinja2 import shared
M shared.py

@@ -70,6 +70,9 @@ sortable=True

), img=fields.TEXT( stored=True + ), + mtime=fields.NUMERIC( + stored=True ) )