better indexer which only updates a document if it's mtime has changed

This commit is contained in:
Peter Molnar 2017-05-26 14:52:30 +01:00
parent 558195288d
commit c6bae2837b
3 changed files with 110 additions and 29 deletions

118
nasg.py
View file

@ -26,6 +26,7 @@ import langdetect
import requests import requests
from breadability.readable import Article from breadability.readable import Article
from whoosh import index from whoosh import index
from whoosh import qparser
import jinja2 import jinja2
import urllib.parse import urllib.parse
import shared import shared
@ -38,27 +39,94 @@ def splitpath(path):
(path,tail) = os.path.split(path) (path,tail) = os.path.split(path)
return parts return parts
class Indexer(object): #class Indexer(object):
#def __init__(self):
#self.tmp = tempfile.mkdtemp(
#'whooshdb_',
#dir=tempfile.gettempdir()
#)
#atexit.register(
#shutil.rmtree,
#os.path.abspath(self.tmp)
#)
#self.ix = index.create_in(self.tmp, shared.schema)
#self.target = os.path.abspath(os.path.join(
#shared.config.get('target', 'builddir'),
#shared.config.get('var', 'searchdb')
#))
#self.writer = self.ix.writer()
#async def append(self, singular):
#logging.info("appending search index with %s", singular.fname)
#content_real = [
#singular.fname,
#singular.summary,
#singular.content,
#]
#content_remote = []
#for url, offlinecopy in singular.offlinecopies.items():
#content_remote.append("%s" % offlinecopy)
#weight = 1
#if singular.isbookmark:
#weight = 10
#if singular.ispage:
#weight = 100
#self.writer.add_document(
#title=singular.title,
#url=singular.url,
#content=" ".join(list(map(str,[*content_real, *content_remote]))),
#date=singular.published.datetime,
#tags=",".join(list(map(str, singular.tags))),
#weight=weight,
#img="%s" % singular.photo
#)
#def finish(self):
#self.writer.commit()
#if os.path.isdir(self.target):
#shutil.rmtree(self.target)
#shutil.copytree(self.tmp, self.target)
class SmartIndexer(object):
def __init__(self): def __init__(self):
self.tmp = tempfile.mkdtemp(
'whooshdb_',
dir=tempfile.gettempdir()
)
atexit.register(
shutil.rmtree,
os.path.abspath(self.tmp)
)
self.ix = index.create_in(self.tmp, shared.schema)
self.target = os.path.abspath(os.path.join( self.target = os.path.abspath(os.path.join(
shared.config.get('target', 'builddir'), shared.config.get('target', 'builddir'),
shared.config.get('var', 'searchdb') shared.config.get('var', 'searchdb')
)) ))
self.writer = self.ix.writer() if not os.path.isdir(self.target):
os.mkdir(self.target)
if index.exists_in(self.target):
self.ix = index.open_dir(self.target)
else:
self.ix = index.create_in(self.target, shared.schema)
self.writer = self.ix.writer()
self.qp = qparser.QueryParser("url", schema=shared.schema)
async def append(self, singular): async def append(self, singular):
logging.info("appending search index with %s", singular.fname) logging.debug("searching for existing index for %s", singular.fname)
exists = False
q = self.qp.parse(singular.url)
r = self.ix.searcher().search(q, limit=1)
if r:
r = r[0]
# nothing to do, the entry is present and is up to date
ixtime = r['mtime']
if int(ixtime) == int(singular.mtime):
logging.info("search index is up to date for %s", singular.fname)
return
else:
logging.info("search index is out of date: %d (indexed) vs %d", ixtime, singular.mtime)
exists = True
content_real = [ content_real = [
singular.fname, singular.fname,
@ -76,6 +144,8 @@ class Indexer(object):
if singular.ispage: if singular.ispage:
weight = 100 weight = 100
if exists:
logging.info("updating search index with %s", singular.fname)
self.writer.add_document( self.writer.add_document(
title=singular.title, title=singular.title,
url=singular.url, url=singular.url,
@ -83,14 +153,24 @@ class Indexer(object):
date=singular.published.datetime, date=singular.published.datetime,
tags=",".join(list(map(str, singular.tags))), tags=",".join(list(map(str, singular.tags))),
weight=weight, weight=weight,
img="%s" % singular.photo img="%s" % singular.photo,
mtime=singular.mtime,
)
else:
logging.info("appending search index with %s", singular.fname)
self.writer.update_document(
title=singular.title,
url=singular.url,
content=" ".join(list(map(str,[*content_real, *content_remote]))),
date=singular.published.datetime,
tags=",".join(list(map(str, singular.tags))),
weight=weight,
img="%s" % singular.photo,
mtime=singular.mtime
) )
def finish(self): def finish(self):
self.writer.commit() self.writer.commit()
if os.path.isdir(self.target):
shutil.rmtree(self.target)
shutil.copytree(self.tmp, self.target)
class OfflineCopy(object): class OfflineCopy(object):
def __init__(self, url): def __init__(self, url):
@ -160,7 +240,7 @@ class OfflineCopy(object):
doc = Article(r.text, url=self.url) doc = Article(r.text, url=self.url)
self.fm.metadata['title'] = doc._original_document.title self.fm.metadata['title'] = doc._original_document.title
self.fm.metadata['realurl'] = r.url self.fm.metadata['realurl'] = r.url
self.fm.content = Pandoc(False).convert(doc.readable) self.fm.content = shared.Pandoc(False).convert(doc.readable)
self.write() self.write()
@ -1046,7 +1126,7 @@ class Singular(object):
@property @property
def html(self): def html(self):
return Pandoc().convert(self.content) return shared.Pandoc().convert(self.content)
@property @property
def offlinecopies(self): def offlinecopies(self):
@ -1330,7 +1410,7 @@ class NASG(object):
shutil.copy2(s, d) shutil.copy2(s, d)
logging.info("pouplating searchdb") logging.info("pouplating searchdb")
searchdb = Indexer() searchdb = SmartIndexer()
loop.run_until_complete(self.__aindex(content, searchdb)) loop.run_until_complete(self.__aindex(content, searchdb))
searchdb.finish() searchdb.finish()

View file

@ -9,8 +9,6 @@ import sanic.response
from sanic.log import log as logging from sanic.log import log as logging
from whoosh import index from whoosh import index
from whoosh import qparser from whoosh import qparser
from whoosh import fields
from whoosh import analysis
import jinja2 import jinja2
import shared import shared

View file

@ -70,6 +70,9 @@ schema = fields.Schema(
), ),
img=fields.TEXT( img=fields.TEXT(
stored=True stored=True
),
mtime=fields.NUMERIC(
stored=True
) )
) )