From 40c334610d49d69e81cf589fea6de8517c80f163 Mon Sep 17 00:00:00 2001 From: Peter Molnar Date: Thu, 1 Jun 2017 11:19:32 +0000 Subject: [PATCH] better search: added NGRAM tokenized text to search fragments --- nasg.py | 8 +++++--- search.py | 2 +- shared.py | 5 ++++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/nasg.py b/nasg.py index 367d2db..717c43d 100755 --- a/nasg.py +++ b/nasg.py @@ -44,7 +44,7 @@ def splitpath(path): return parts -class SmartIndexer(object): +class Indexer(object): def __init__(self): self.target = os.path.abspath(os.path.join( @@ -94,12 +94,14 @@ class SmartIndexer(object): if singular.ispage: weight = 100 + content = " ".join(list(map(str,[*content_real, *content_remote]))) if exists: logging.info("updating search index with %s", singular.fname) self.writer.add_document( title=singular.title, url=singular.url, - content=" ".join(list(map(str,[*content_real, *content_remote]))), + content=content, + fuzzy=content, date=singular.published.datetime, tags=",".join(list(map(str, singular.tags))), weight=weight, @@ -1355,7 +1357,7 @@ class NASG(object): shutil.copy2(s, d) logging.info("pouplating searchdb") - searchdb = SmartIndexer() + searchdb = Indexer() loop.run_until_complete(self.__aindex(content, searchdb)) searchdb.finish() diff --git a/search.py b/search.py index e1cc10d..20c5162 100644 --- a/search.py +++ b/search.py @@ -28,7 +28,7 @@ def SearchHandler(query, tmpl): ))) qp = qparser.MultifieldParser( - ["title", "content", "tags"], + ["title", "content","fuzzy", "tags"], schema = shared.schema ) diff --git a/shared.py b/shared.py index 0acf2ab..f5b51e4 100644 --- a/shared.py +++ b/shared.py @@ -57,12 +57,15 @@ schema = fields.Schema( stored=True, analyzer=analysis.FancyAnalyzer() ), + fuzzy=fields.NGRAMWORDS( + tokenizer=analysis.NgramTokenizer(4) + ), tags=fields.TEXT( stored=True, analyzer=analysis.KeywordAnalyzer( lowercase=True, commas=True - ) + ), ), weight=fields.NUMERIC( sortable=True