better indexer which only updates a document if its mtime has changed
parent 558195288d
commit c6bae2837b

3 changed files with 110 additions and 29 deletions
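
In short: instead of building a fresh Whoosh index in a temporary directory and copying it over the old one on every run, the new SmartIndexer opens the existing on-disk index, looks each entry up by its url, and skips a document when the indexed mtime matches the source file's mtime. Below is a minimal sketch of that pattern, not the commit's code itself: it assumes only the whoosh package, a placeholder 'searchdb' directory, and a cut-down stand-in for shared.schema (the real schema and directory layout live in the repository's shared module and nasg.py).

    # sketch of mtime-based incremental indexing with Whoosh (assumptions noted above)
    import os

    from whoosh import fields, index, qparser

    schema = fields.Schema(
        url=fields.ID(stored=True, unique=True),      # unique key, needed for update_document()
        mtime=fields.NUMERIC(stored=True),            # stored so it can be read back from a hit
        content=fields.TEXT,
    )

    target = os.path.abspath('searchdb')              # placeholder index directory
    if not os.path.isdir(target):
        os.mkdir(target)

    # reuse the existing index instead of rebuilding it from scratch
    ix = index.open_dir(target) if index.exists_in(target) else index.create_in(target, schema)
    qp = qparser.QueryParser("url", schema=schema)


    def maybe_reindex(writer, url, mtime, content):
        """Write the document only when the stored mtime differs from the file's mtime."""
        with ix.searcher() as searcher:
            hits = searcher.search(qp.parse(url), limit=1)
            if hits and int(hits[0]['mtime']) == int(mtime):
                return False                           # already up to date, nothing to do
        # update_document() replaces any entry sharing the unique url field
        writer.update_document(url=url, mtime=int(mtime), content=content)
        return True

In the commit itself the writer is shared across all append() calls and committed once in finish(), so a run over an unchanged site costs little more than a handful of index lookups.
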
nasg.py (134 changed lines)

@@ -26,6 +26,7 @@ import langdetect
 import requests
 from breadability.readable import Article
 from whoosh import index
+from whoosh import qparser
 import jinja2
 import urllib.parse
 import shared

@@ -38,27 +39,94 @@ def splitpath(path):
     (path,tail) = os.path.split(path)
     return parts
 
 
-class Indexer(object):
+#class Indexer(object):
+
+    #def __init__(self):
+        #self.tmp = tempfile.mkdtemp(
+            #'whooshdb_',
+            #dir=tempfile.gettempdir()
+        #)
+        #atexit.register(
+            #shutil.rmtree,
+            #os.path.abspath(self.tmp)
+        #)
+        #self.ix = index.create_in(self.tmp, shared.schema)
+        #self.target = os.path.abspath(os.path.join(
+            #shared.config.get('target', 'builddir'),
+            #shared.config.get('var', 'searchdb')
+        #))
+        #self.writer = self.ix.writer()
+
+    #async def append(self, singular):
+        #logging.info("appending search index with %s", singular.fname)
+
+        #content_real = [
+            #singular.fname,
+            #singular.summary,
+            #singular.content,
+        #]
+
+        #content_remote = []
+        #for url, offlinecopy in singular.offlinecopies.items():
+            #content_remote.append("%s" % offlinecopy)
+
+        #weight = 1
+        #if singular.isbookmark:
+            #weight = 10
+        #if singular.ispage:
+            #weight = 100
+
+        #self.writer.add_document(
+            #title=singular.title,
+            #url=singular.url,
+            #content=" ".join(list(map(str,[*content_real, *content_remote]))),
+            #date=singular.published.datetime,
+            #tags=",".join(list(map(str, singular.tags))),
+            #weight=weight,
+            #img="%s" % singular.photo
+        #)
+
+    #def finish(self):
+        #self.writer.commit()
+        #if os.path.isdir(self.target):
+            #shutil.rmtree(self.target)
+        #shutil.copytree(self.tmp, self.target)
+
+
+class SmartIndexer(object):
 
     def __init__(self):
-        self.tmp = tempfile.mkdtemp(
-            'whooshdb_',
-            dir=tempfile.gettempdir()
-        )
-        atexit.register(
-            shutil.rmtree,
-            os.path.abspath(self.tmp)
-        )
-        self.ix = index.create_in(self.tmp, shared.schema)
         self.target = os.path.abspath(os.path.join(
             shared.config.get('target', 'builddir'),
             shared.config.get('var', 'searchdb')
         ))
-        self.writer = self.ix.writer()
+        if not os.path.isdir(self.target):
+            os.mkdir(self.target)
+
+        if index.exists_in(self.target):
+            self.ix = index.open_dir(self.target)
+        else:
+            self.ix = index.create_in(self.target, shared.schema)
+        self.writer = self.ix.writer()
+        self.qp = qparser.QueryParser("url", schema=shared.schema)
 
     async def append(self, singular):
-        logging.info("appending search index with %s", singular.fname)
+        logging.debug("searching for existing index for %s", singular.fname)
+        exists = False
+
+        q = self.qp.parse(singular.url)
+        r = self.ix.searcher().search(q, limit=1)
+        if r:
+            r = r[0]
+            # nothing to do, the entry is present and is up to date
+            ixtime = r['mtime']
+            if int(ixtime) == int(singular.mtime):
+                logging.info("search index is up to date for %s", singular.fname)
+                return
+            else:
+                logging.info("search index is out of date: %d (indexed) vs %d", ixtime, singular.mtime)
+                exists = True
 
         content_real = [
             singular.fname,

@@ -76,21 +144,33 @@ class Indexer(object):
         if singular.ispage:
             weight = 100
 
-        self.writer.add_document(
-            title=singular.title,
-            url=singular.url,
-            content=" ".join(list(map(str,[*content_real, *content_remote]))),
-            date=singular.published.datetime,
-            tags=",".join(list(map(str, singular.tags))),
-            weight=weight,
-            img="%s" % singular.photo
-        )
+        if exists:
+            logging.info("updating search index with %s", singular.fname)
+            self.writer.add_document(
+                title=singular.title,
+                url=singular.url,
+                content=" ".join(list(map(str,[*content_real, *content_remote]))),
+                date=singular.published.datetime,
+                tags=",".join(list(map(str, singular.tags))),
+                weight=weight,
+                img="%s" % singular.photo,
+                mtime=singular.mtime,
+            )
+        else:
+            logging.info("appending search index with %s", singular.fname)
+            self.writer.update_document(
+                title=singular.title,
+                url=singular.url,
+                content=" ".join(list(map(str,[*content_real, *content_remote]))),
+                date=singular.published.datetime,
+                tags=",".join(list(map(str, singular.tags))),
+                weight=weight,
+                img="%s" % singular.photo,
+                mtime=singular.mtime
+            )
 
     def finish(self):
         self.writer.commit()
-        if os.path.isdir(self.target):
-            shutil.rmtree(self.target)
-        shutil.copytree(self.tmp, self.target)
 
 
 class OfflineCopy(object):
     def __init__(self, url):

@@ -160,7 +240,7 @@ class OfflineCopy(object):
         doc = Article(r.text, url=self.url)
         self.fm.metadata['title'] = doc._original_document.title
         self.fm.metadata['realurl'] = r.url
-        self.fm.content = Pandoc(False).convert(doc.readable)
+        self.fm.content = shared.Pandoc(False).convert(doc.readable)
         self.write()
 

@@ -1046,7 +1126,7 @@ class Singular(object):
 
     @property
     def html(self):
-        return Pandoc().convert(self.content)
+        return shared.Pandoc().convert(self.content)
 
     @property
     def offlinecopies(self):

@@ -1330,7 +1410,7 @@ class NASG(object):
             shutil.copy2(s, d)
 
         logging.info("pouplating searchdb")
-        searchdb = Indexer()
+        searchdb = SmartIndexer()
         loop.run_until_complete(self.__aindex(content, searchdb))
         searchdb.finish()
 

@@ -9,8 +9,6 @@ import sanic.response
 from sanic.log import log as logging
 from whoosh import index
 from whoosh import qparser
-from whoosh import fields
-from whoosh import analysis
 import jinja2
 import shared
 

@@ -70,6 +70,9 @@ schema = fields.Schema(
     ),
     img=fields.TEXT(
         stored=True
+    ),
+    mtime=fields.NUMERIC(
+        stored=True
     )
 )
 
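
The schema change above is what makes the check in SmartIndexer.append() possible: mtime is declared with stored=True, so the value comes back on a search hit as r['mtime'] instead of only being indexed. A quick, hypothetical way to peek at what got stored (the index directory name and URL are placeholders, not taken from the repository):

    from whoosh import index

    ix = index.open_dir('searchdb')                        # placeholder index directory
    with ix.searcher() as searcher:
        stored = searcher.document(url="https://example.com/some-post/")
        if stored:
            print(stored['mtime'])                         # readable only because the field is stored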