better indexer which only updates a document if its mtime has changed
This commit is contained in:
parent
558195288d
commit
c6bae2837b
3 changed files with 110 additions and 29 deletions
134
nasg.py
134
nasg.py
|
@ -26,6 +26,7 @@ import langdetect
|
|||
import requests
|
||||
from breadability.readable import Article
|
||||
from whoosh import index
|
||||
from whoosh import qparser
|
||||
import jinja2
|
||||
import urllib.parse
|
||||
import shared
|
||||
|
@ -38,27 +39,94 @@ def splitpath(path):
|
|||
(path,tail) = os.path.split(path)
|
||||
return parts
|
||||
|
||||
class Indexer(object):
|
||||
#class Indexer(object):
|
||||
|
||||
#def __init__(self):
|
||||
#self.tmp = tempfile.mkdtemp(
|
||||
#'whooshdb_',
|
||||
#dir=tempfile.gettempdir()
|
||||
#)
|
||||
#atexit.register(
|
||||
#shutil.rmtree,
|
||||
#os.path.abspath(self.tmp)
|
||||
#)
|
||||
#self.ix = index.create_in(self.tmp, shared.schema)
|
||||
#self.target = os.path.abspath(os.path.join(
|
||||
#shared.config.get('target', 'builddir'),
|
||||
#shared.config.get('var', 'searchdb')
|
||||
#))
|
||||
#self.writer = self.ix.writer()
|
||||
|
||||
|
||||
#async def append(self, singular):
|
||||
#logging.info("appending search index with %s", singular.fname)
|
||||
|
||||
#content_real = [
|
||||
#singular.fname,
|
||||
#singular.summary,
|
||||
#singular.content,
|
||||
#]
|
||||
|
||||
#content_remote = []
|
||||
#for url, offlinecopy in singular.offlinecopies.items():
|
||||
#content_remote.append("%s" % offlinecopy)
|
||||
|
||||
#weight = 1
|
||||
#if singular.isbookmark:
|
||||
#weight = 10
|
||||
#if singular.ispage:
|
||||
#weight = 100
|
||||
|
||||
#self.writer.add_document(
|
||||
#title=singular.title,
|
||||
#url=singular.url,
|
||||
#content=" ".join(list(map(str,[*content_real, *content_remote]))),
|
||||
#date=singular.published.datetime,
|
||||
#tags=",".join(list(map(str, singular.tags))),
|
||||
#weight=weight,
|
||||
#img="%s" % singular.photo
|
||||
#)
|
||||
|
||||
#def finish(self):
|
||||
#self.writer.commit()
|
||||
#if os.path.isdir(self.target):
|
||||
#shutil.rmtree(self.target)
|
||||
#shutil.copytree(self.tmp, self.target)
|
||||
|
||||
|
||||
class SmartIndexer(object):
|
||||
|
||||
def __init__(self):
|
||||
self.tmp = tempfile.mkdtemp(
|
||||
'whooshdb_',
|
||||
dir=tempfile.gettempdir()
|
||||
)
|
||||
atexit.register(
|
||||
shutil.rmtree,
|
||||
os.path.abspath(self.tmp)
|
||||
)
|
||||
self.ix = index.create_in(self.tmp, shared.schema)
|
||||
self.target = os.path.abspath(os.path.join(
|
||||
shared.config.get('target', 'builddir'),
|
||||
shared.config.get('var', 'searchdb')
|
||||
))
|
||||
self.writer = self.ix.writer()
|
||||
if not os.path.isdir(self.target):
|
||||
os.mkdir(self.target)
|
||||
|
||||
if index.exists_in(self.target):
|
||||
self.ix = index.open_dir(self.target)
|
||||
else:
|
||||
self.ix = index.create_in(self.target, shared.schema)
|
||||
self.writer = self.ix.writer()
|
||||
self.qp = qparser.QueryParser("url", schema=shared.schema)
|
||||
|
||||
async def append(self, singular):
|
||||
logging.info("appending search index with %s", singular.fname)
|
||||
logging.debug("searching for existing index for %s", singular.fname)
|
||||
exists = False
|
||||
|
||||
q = self.qp.parse(singular.url)
|
||||
r = self.ix.searcher().search(q, limit=1)
|
||||
if r:
|
||||
r = r[0]
|
||||
# nothing to do, the entry is present and is up to date
|
||||
ixtime = r['mtime']
|
||||
if int(ixtime) == int(singular.mtime):
|
||||
logging.info("search index is up to date for %s", singular.fname)
|
||||
return
|
||||
else:
|
||||
logging.info("search index is out of date: %d (indexed) vs %d", ixtime, singular.mtime)
|
||||
exists = True
|
||||
|
||||
content_real = [
|
||||
singular.fname,
|
||||
|
@ -76,21 +144,33 @@ class Indexer(object):
|
|||
if singular.ispage:
|
||||
weight = 100
|
||||
|
||||
self.writer.add_document(
|
||||
title=singular.title,
|
||||
url=singular.url,
|
||||
content=" ".join(list(map(str,[*content_real, *content_remote]))),
|
||||
date=singular.published.datetime,
|
||||
tags=",".join(list(map(str, singular.tags))),
|
||||
weight=weight,
|
||||
img="%s" % singular.photo
|
||||
)
|
||||
if exists:
|
||||
logging.info("updating search index with %s", singular.fname)
|
||||
self.writer.add_document(
|
||||
title=singular.title,
|
||||
url=singular.url,
|
||||
content=" ".join(list(map(str,[*content_real, *content_remote]))),
|
||||
date=singular.published.datetime,
|
||||
tags=",".join(list(map(str, singular.tags))),
|
||||
weight=weight,
|
||||
img="%s" % singular.photo,
|
||||
mtime=singular.mtime,
|
||||
)
|
||||
else:
|
||||
logging.info("appending search index with %s", singular.fname)
|
||||
self.writer.update_document(
|
||||
title=singular.title,
|
||||
url=singular.url,
|
||||
content=" ".join(list(map(str,[*content_real, *content_remote]))),
|
||||
date=singular.published.datetime,
|
||||
tags=",".join(list(map(str, singular.tags))),
|
||||
weight=weight,
|
||||
img="%s" % singular.photo,
|
||||
mtime=singular.mtime
|
||||
)
|
||||
|
||||
def finish(self):
|
||||
self.writer.commit()
|
||||
if os.path.isdir(self.target):
|
||||
shutil.rmtree(self.target)
|
||||
shutil.copytree(self.tmp, self.target)
|
||||
|
||||
class OfflineCopy(object):
|
||||
def __init__(self, url):
|
||||
|
@ -160,7 +240,7 @@ class OfflineCopy(object):
|
|||
doc = Article(r.text, url=self.url)
|
||||
self.fm.metadata['title'] = doc._original_document.title
|
||||
self.fm.metadata['realurl'] = r.url
|
||||
self.fm.content = Pandoc(False).convert(doc.readable)
|
||||
self.fm.content = shared.Pandoc(False).convert(doc.readable)
|
||||
self.write()
|
||||
|
||||
|
||||
|
@ -1046,7 +1126,7 @@ class Singular(object):
|
|||
|
||||
@property
|
||||
def html(self):
|
||||
return Pandoc().convert(self.content)
|
||||
return shared.Pandoc().convert(self.content)
|
||||
|
||||
@property
|
||||
def offlinecopies(self):
|
||||
|
@ -1330,7 +1410,7 @@ class NASG(object):
|
|||
shutil.copy2(s, d)
|
||||
|
||||
logging.info("pouplating searchdb")
|
||||
searchdb = Indexer()
|
||||
searchdb = SmartIndexer()
|
||||
loop.run_until_complete(self.__aindex(content, searchdb))
|
||||
searchdb.finish()
|
||||
|
||||
|
|
|
@ -9,8 +9,6 @@ import sanic.response
|
|||
from sanic.log import log as logging
|
||||
from whoosh import index
|
||||
from whoosh import qparser
|
||||
from whoosh import fields
|
||||
from whoosh import analysis
|
||||
import jinja2
|
||||
import shared
|
||||
|
||||
|
|
|
@ -70,6 +70,9 @@ schema = fields.Schema(
|
|||
),
|
||||
img=fields.TEXT(
|
||||
stored=True
|
||||
),
|
||||
mtime=fields.NUMERIC(
|
||||
stored=True
|
||||
)
|
||||
)
|
||||
|
||||
|
|
Loading…
Reference in a new issue