diff --git a/elasticlunr.js b/elasticlunr.js new file mode 100644 index 0000000..e43b0cd --- /dev/null +++ b/elasticlunr.js @@ -0,0 +1,2507 @@ +/** + * elasticlunr - http://weixsong.github.io + * Lightweight full-text search engine in Javascript for browser search and offline search. - 0.9.5 + * + * Copyright (C) 2017 Oliver Nightingale + * Copyright (C) 2017 Wei Song + * MIT Licensed + * @license + */ + +(function(){ + +/*! + * elasticlunr.js + * Copyright (C) 2017 Oliver Nightingale + * Copyright (C) 2017 Wei Song + */ + +/** + * Convenience function for instantiating a new elasticlunr index and configuring it + * with the default pipeline functions and the passed config function. + * + * When using this convenience function a new index will be created with the + * following functions already in the pipeline: + * + * 1. elasticlunr.trimmer - trim non-word character + * 2. elasticlunr.StopWordFilter - filters out any stop words before they enter the + * index + * 3. elasticlunr.stemmer - stems the tokens before entering the index. + * + * + * Example: + * + * var idx = elasticlunr(function () { + * this.addField('id'); + * this.addField('title'); + * this.addField('body'); + * + * //this.setRef('id'); // default ref is 'id' + * + * this.pipeline.add(function () { + * // some custom pipeline function + * }); + * }); + * + * idx.addDoc({ + * id: 1, + * title: 'Oracle released database 12g', + * body: 'Yestaday, Oracle has released their latest database, named 12g, more robust. this product will increase Oracle profit.' + * }); + * + * idx.addDoc({ + * id: 2, + * title: 'Oracle released annual profit report', + * body: 'Yestaday, Oracle has released their annual profit report of 2015, total profit is 12.5 Billion.' + * }); + * + * # simple search + * idx.search('oracle database'); + * + * # search with query-time boosting + * idx.search('oracle database', {fields: {title: {boost: 2}, body: {boost: 1}}}); + * + * @param {Function} config A function that will be called with the new instance + * of the elasticlunr.Index as both its context and first parameter. It can be used to + * customize the instance of new elasticlunr.Index. + * @namespace + * @module + * @return {elasticlunr.Index} + * + */ +var elasticlunr = function (config) { + var idx = new elasticlunr.Index; + + idx.pipeline.add( + elasticlunr.trimmer, + elasticlunr.stopWordFilter, + elasticlunr.stemmer + ); + + if (config) config.call(idx, idx); + + return idx; +}; + +elasticlunr.version = "0.9.5"; + +// only used this to make elasticlunr.js compatible with lunr-languages +// this is a trick to define a global alias of elasticlunr +lunr = elasticlunr; + +/*! + * elasticlunr.utils + * Copyright (C) 2017 Oliver Nightingale + * Copyright (C) 2017 Wei Song + */ + +/** + * A namespace containing utils for the rest of the elasticlunr library + */ +elasticlunr.utils = {}; + +/** + * Print a warning message to the console. + * + * @param {String} message The message to be printed. + * @memberOf Utils + */ +elasticlunr.utils.warn = (function (global) { + return function (message) { + if (global.console && console.warn) { + console.warn(message); + } + }; +})(this); + +/** + * Convert an object to string. + * + * In the case of `null` and `undefined` the function returns + * an empty string, in all other cases the result of calling + * `toString` on the passed object is returned. + * + * @param {object} obj The object to convert to a string. + * @return {String} string representation of the passed object. + * @memberOf Utils + */ +elasticlunr.utils.toString = function (obj) { + if (obj === void 0 || obj === null) { + return ""; + } + + return obj.toString(); +}; +/*! + * elasticlunr.EventEmitter + * Copyright (C) 2017 Oliver Nightingale + * Copyright (C) 2017 Wei Song + */ + +/** + * elasticlunr.EventEmitter is an event emitter for elasticlunr. + * It manages adding and removing event handlers and triggering events and their handlers. + * + * Each event could has multiple corresponding functions, + * these functions will be called as the sequence that they are added into the event. + * + * @constructor + */ +elasticlunr.EventEmitter = function () { + this.events = {}; +}; + +/** + * Binds a handler function to a specific event(s). + * + * Can bind a single function to many different events in one call. + * + * @param {String} [eventName] The name(s) of events to bind this function to. + * @param {Function} fn The function to call when an event is fired. + * @memberOf EventEmitter + */ +elasticlunr.EventEmitter.prototype.addListener = function () { + var args = Array.prototype.slice.call(arguments), + fn = args.pop(), + names = args; + + if (typeof fn !== "function") throw new TypeError ("last argument must be a function"); + + names.forEach(function (name) { + if (!this.hasHandler(name)) this.events[name] = []; + this.events[name].push(fn); + }, this); +}; + +/** + * Removes a handler function from a specific event. + * + * @param {String} eventName The name of the event to remove this function from. + * @param {Function} fn The function to remove from an event. + * @memberOf EventEmitter + */ +elasticlunr.EventEmitter.prototype.removeListener = function (name, fn) { + if (!this.hasHandler(name)) return; + + var fnIndex = this.events[name].indexOf(fn); + if (fnIndex === -1) return; + + this.events[name].splice(fnIndex, 1); + + if (this.events[name].length == 0) delete this.events[name]; +}; + +/** + * Call all functions that bounded to the given event. + * + * Additional data can be passed to the event handler as arguments to `emit` + * after the event name. + * + * @param {String} eventName The name of the event to emit. + * @memberOf EventEmitter + */ +elasticlunr.EventEmitter.prototype.emit = function (name) { + if (!this.hasHandler(name)) return; + + var args = Array.prototype.slice.call(arguments, 1); + + this.events[name].forEach(function (fn) { + fn.apply(undefined, args); + }, this); +}; + +/** + * Checks whether a handler has ever been stored against an event. + * + * @param {String} eventName The name of the event to check. + * @private + * @memberOf EventEmitter + */ +elasticlunr.EventEmitter.prototype.hasHandler = function (name) { + return name in this.events; +}; +/*! + * elasticlunr.tokenizer + * Copyright (C) 2017 Oliver Nightingale + * Copyright (C) 2017 Wei Song + */ + +/** + * A function for splitting a string into tokens. + * Currently English is supported as default. + * Uses `elasticlunr.tokenizer.seperator` to split strings, you could change + * the value of this property to set how you want strings are split into tokens. + * IMPORTANT: use elasticlunr.tokenizer.seperator carefully, if you are not familiar with + * text process, then you'd better not change it. + * + * @module + * @param {String} str The string that you want to tokenize. + * @see elasticlunr.tokenizer.seperator + * @return {Array} + */ +elasticlunr.tokenizer = function (str) { + if (!arguments.length || str === null || str === undefined) return []; + if (Array.isArray(str)) { + var arr = str.filter(function(token) { + if (token === null || token === undefined) { + return false; + } + + return true; + }); + + arr = arr.map(function (t) { + return elasticlunr.utils.toString(t).toLowerCase(); + }); + + var out = []; + arr.forEach(function(item) { + var tokens = item.split(elasticlunr.tokenizer.seperator); + out = out.concat(tokens); + }, this); + + return out; + } + + return str.toString().trim().toLowerCase().split(elasticlunr.tokenizer.seperator); +}; + +/** + * Default string seperator. + */ +elasticlunr.tokenizer.defaultSeperator = /[\s\-]+/; + +/** + * The sperator used to split a string into tokens. Override this property to change the behaviour of + * `elasticlunr.tokenizer` behaviour when tokenizing strings. By default this splits on whitespace and hyphens. + * + * @static + * @see elasticlunr.tokenizer + */ +elasticlunr.tokenizer.seperator = elasticlunr.tokenizer.defaultSeperator; + +/** + * Set up customized string seperator + * + * @param {Object} sep The customized seperator that you want to use to tokenize a string. + */ +elasticlunr.tokenizer.setSeperator = function(sep) { + if (sep !== null && sep !== undefined && typeof(sep) === 'object') { + elasticlunr.tokenizer.seperator = sep; + } +} + +/** + * Reset string seperator + * + */ +elasticlunr.tokenizer.resetSeperator = function() { + elasticlunr.tokenizer.seperator = elasticlunr.tokenizer.defaultSeperator; +} + +/** + * Get string seperator + * + */ +elasticlunr.tokenizer.getSeperator = function() { + return elasticlunr.tokenizer.seperator; +} +/*! + * elasticlunr.Pipeline + * Copyright (C) 2017 Oliver Nightingale + * Copyright (C) 2017 Wei Song + */ + +/** + * elasticlunr.Pipelines maintain an ordered list of functions to be applied to + * both documents tokens and query tokens. + * + * An instance of elasticlunr.Index will contain a pipeline + * with a trimmer, a stop word filter, an English stemmer. Extra + * functions can be added before or after either of these functions or these + * default functions can be removed. + * + * When run the pipeline, it will call each function in turn. + * + * The output of the functions in the pipeline will be passed to the next function + * in the pipeline. To exclude a token from entering the index the function + * should return undefined, the rest of the pipeline will not be called with + * this token. + * + * For serialisation of pipelines to work, all functions used in an instance of + * a pipeline should be registered with elasticlunr.Pipeline. Registered functions can + * then be loaded. If trying to load a serialised pipeline that uses functions + * that are not registered an error will be thrown. + * + * If not planning on serialising the pipeline then registering pipeline functions + * is not necessary. + * + * @constructor + */ +elasticlunr.Pipeline = function () { + this._queue = []; +}; + +elasticlunr.Pipeline.registeredFunctions = {}; + +/** + * Register a function in the pipeline. + * + * Functions that are used in the pipeline should be registered if the pipeline + * needs to be serialised, or a serialised pipeline needs to be loaded. + * + * Registering a function does not add it to a pipeline, functions must still be + * added to instances of the pipeline for them to be used when running a pipeline. + * + * @param {Function} fn The function to register. + * @param {String} label The label to register this function with + * @memberOf Pipeline + */ +elasticlunr.Pipeline.registerFunction = function (fn, label) { + if (label in elasticlunr.Pipeline.registeredFunctions) { + elasticlunr.utils.warn('Overwriting existing registered function: ' + label); + } + + fn.label = label; + elasticlunr.Pipeline.registeredFunctions[label] = fn; +}; + +/** + * Get a registered function in the pipeline. + * + * @param {String} label The label of registered function. + * @return {Function} + * @memberOf Pipeline + */ +elasticlunr.Pipeline.getRegisteredFunction = function (label) { + if ((label in elasticlunr.Pipeline.registeredFunctions) !== true) { + return null; + } + + return elasticlunr.Pipeline.registeredFunctions[label]; +}; + +/** + * Warns if the function is not registered as a Pipeline function. + * + * @param {Function} fn The function to check for. + * @private + * @memberOf Pipeline + */ +elasticlunr.Pipeline.warnIfFunctionNotRegistered = function (fn) { + var isRegistered = fn.label && (fn.label in this.registeredFunctions); + + if (!isRegistered) { + elasticlunr.utils.warn('Function is not registered with pipeline. This may cause problems when serialising the index.\n', fn); + } +}; + +/** + * Loads a previously serialised pipeline. + * + * All functions to be loaded must already be registered with elasticlunr.Pipeline. + * If any function from the serialised data has not been registered then an + * error will be thrown. + * + * @param {Object} serialised The serialised pipeline to load. + * @return {elasticlunr.Pipeline} + * @memberOf Pipeline + */ +elasticlunr.Pipeline.load = function (serialised) { + var pipeline = new elasticlunr.Pipeline; + + serialised.forEach(function (fnName) { + var fn = elasticlunr.Pipeline.getRegisteredFunction(fnName); + + if (fn) { + pipeline.add(fn); + } else { + throw new Error('Cannot load un-registered function: ' + fnName); + } + }); + + return pipeline; +}; + +/** + * Adds new functions to the end of the pipeline. + * + * Logs a warning if the function has not been registered. + * + * @param {Function} functions Any number of functions to add to the pipeline. + * @memberOf Pipeline + */ +elasticlunr.Pipeline.prototype.add = function () { + var fns = Array.prototype.slice.call(arguments); + + fns.forEach(function (fn) { + elasticlunr.Pipeline.warnIfFunctionNotRegistered(fn); + this._queue.push(fn); + }, this); +}; + +/** + * Adds a single function after a function that already exists in the + * pipeline. + * + * Logs a warning if the function has not been registered. + * If existingFn is not found, throw an Exception. + * + * @param {Function} existingFn A function that already exists in the pipeline. + * @param {Function} newFn The new function to add to the pipeline. + * @memberOf Pipeline + */ +elasticlunr.Pipeline.prototype.after = function (existingFn, newFn) { + elasticlunr.Pipeline.warnIfFunctionNotRegistered(newFn); + + var pos = this._queue.indexOf(existingFn); + if (pos === -1) { + throw new Error('Cannot find existingFn'); + } + + this._queue.splice(pos + 1, 0, newFn); +}; + +/** + * Adds a single function before a function that already exists in the + * pipeline. + * + * Logs a warning if the function has not been registered. + * If existingFn is not found, throw an Exception. + * + * @param {Function} existingFn A function that already exists in the pipeline. + * @param {Function} newFn The new function to add to the pipeline. + * @memberOf Pipeline + */ +elasticlunr.Pipeline.prototype.before = function (existingFn, newFn) { + elasticlunr.Pipeline.warnIfFunctionNotRegistered(newFn); + + var pos = this._queue.indexOf(existingFn); + if (pos === -1) { + throw new Error('Cannot find existingFn'); + } + + this._queue.splice(pos, 0, newFn); +}; + +/** + * Removes a function from the pipeline. + * + * @param {Function} fn The function to remove from the pipeline. + * @memberOf Pipeline + */ +elasticlunr.Pipeline.prototype.remove = function (fn) { + var pos = this._queue.indexOf(fn); + if (pos === -1) { + return; + } + + this._queue.splice(pos, 1); +}; + +/** + * Runs the current list of functions that registered in the pipeline against the + * input tokens. + * + * @param {Array} tokens The tokens to run through the pipeline. + * @return {Array} + * @memberOf Pipeline + */ +elasticlunr.Pipeline.prototype.run = function (tokens) { + var out = [], + tokenLength = tokens.length, + pipelineLength = this._queue.length; + + for (var i = 0; i < tokenLength; i++) { + var token = tokens[i]; + + for (var j = 0; j < pipelineLength; j++) { + token = this._queue[j](token, i, tokens); + if (token === void 0 || token === null) break; + }; + + if (token !== void 0 && token !== null) out.push(token); + }; + + return out; +}; + +/** + * Resets the pipeline by removing any existing processors. + * + * @memberOf Pipeline + */ +elasticlunr.Pipeline.prototype.reset = function () { + this._queue = []; +}; + + /** + * Get the pipeline if user want to check the pipeline. + * + * @memberOf Pipeline + */ + elasticlunr.Pipeline.prototype.get = function () { + return this._queue; + }; + +/** + * Returns a representation of the pipeline ready for serialisation. + * Only serialize pipeline function's name. Not storing function, so when + * loading the archived JSON index file, corresponding pipeline function is + * added by registered function of elasticlunr.Pipeline.registeredFunctions + * + * Logs a warning if the function has not been registered. + * + * @return {Array} + * @memberOf Pipeline + */ +elasticlunr.Pipeline.prototype.toJSON = function () { + return this._queue.map(function (fn) { + elasticlunr.Pipeline.warnIfFunctionNotRegistered(fn); + return fn.label; + }); +}; +/*! + * elasticlunr.Index + * Copyright (C) 2017 Oliver Nightingale + * Copyright (C) 2017 Wei Song + */ + +/** + * elasticlunr.Index is object that manages a search index. It contains the indexes + * and stores all the tokens and document lookups. It also provides the main + * user facing API for the library. + * + * @constructor + */ +elasticlunr.Index = function () { + this._fields = []; + this._ref = 'id'; + this.pipeline = new elasticlunr.Pipeline; + this.documentStore = new elasticlunr.DocumentStore; + this.index = {}; + this.eventEmitter = new elasticlunr.EventEmitter; + this._idfCache = {}; + + this.on('add', 'remove', 'update', (function () { + this._idfCache = {}; + }).bind(this)); +}; + +/** + * Bind a handler to events being emitted by the index. + * + * The handler can be bound to many events at the same time. + * + * @param {String} [eventName] The name(s) of events to bind the function to. + * @param {Function} fn The serialised set to load. + * @memberOf Index + */ +elasticlunr.Index.prototype.on = function () { + var args = Array.prototype.slice.call(arguments); + return this.eventEmitter.addListener.apply(this.eventEmitter, args); +}; + +/** + * Removes a handler from an event being emitted by the index. + * + * @param {String} eventName The name of events to remove the function from. + * @param {Function} fn The serialised set to load. + * @memberOf Index + */ +elasticlunr.Index.prototype.off = function (name, fn) { + return this.eventEmitter.removeListener(name, fn); +}; + +/** + * Loads a previously serialised index. + * + * Issues a warning if the index being imported was serialised + * by a different version of elasticlunr. + * + * @param {Object} serialisedData The serialised set to load. + * @return {elasticlunr.Index} + * @memberOf Index + */ +elasticlunr.Index.load = function (serialisedData) { + if (serialisedData.version !== elasticlunr.version) { + elasticlunr.utils.warn('version mismatch: current ' + + elasticlunr.version + ' importing ' + serialisedData.version); + } + + var idx = new this; + + idx._fields = serialisedData.fields; + idx._ref = serialisedData.ref; + idx.documentStore = elasticlunr.DocumentStore.load(serialisedData.documentStore); + idx.pipeline = elasticlunr.Pipeline.load(serialisedData.pipeline); + idx.index = {}; + for (var field in serialisedData.index) { + idx.index[field] = elasticlunr.InvertedIndex.load(serialisedData.index[field]); + } + + return idx; +}; + +/** + * Adds a field to the list of fields that will be searchable within documents in the index. + * + * Remember that inner index is build based on field, which means each field has one inverted index. + * + * Fields should be added before any documents are added to the index, fields + * that are added after documents are added to the index will only apply to new + * documents added to the index. + * + * @param {String} fieldName The name of the field within the document that should be indexed + * @return {elasticlunr.Index} + * @memberOf Index + */ +elasticlunr.Index.prototype.addField = function (fieldName) { + this._fields.push(fieldName); + this.index[fieldName] = new elasticlunr.InvertedIndex; + return this; +}; + +/** + * Sets the property used to uniquely identify documents added to the index, + * by default this property is 'id'. + * + * This should only be changed before adding documents to the index, changing + * the ref property without resetting the index can lead to unexpected results. + * + * @param {String} refName The property to use to uniquely identify the + * documents in the index. + * @param {Boolean} emitEvent Whether to emit add events, defaults to true + * @return {elasticlunr.Index} + * @memberOf Index + */ +elasticlunr.Index.prototype.setRef = function (refName) { + this._ref = refName; + return this; +}; + +/** + * + * Set if the JSON format original documents are save into elasticlunr.DocumentStore + * + * Defaultly save all the original JSON documents. + * + * @param {Boolean} save Whether to save the original JSON documents. + * @return {elasticlunr.Index} + * @memberOf Index + */ +elasticlunr.Index.prototype.saveDocument = function (save) { + this.documentStore = new elasticlunr.DocumentStore(save); + return this; +}; + +/** + * Add a JSON format document to the index. + * + * This is the way new documents enter the index, this function will run the + * fields from the document through the index's pipeline and then add it to + * the index, it will then show up in search results. + * + * An 'add' event is emitted with the document that has been added and the index + * the document has been added to. This event can be silenced by passing false + * as the second argument to add. + * + * @param {Object} doc The JSON format document to add to the index. + * @param {Boolean} emitEvent Whether or not to emit events, default true. + * @memberOf Index + */ +elasticlunr.Index.prototype.addDoc = function (doc, emitEvent) { + if (!doc) return; + var emitEvent = emitEvent === undefined ? true : emitEvent; + + var docRef = doc[this._ref]; + + this.documentStore.addDoc(docRef, doc); + this._fields.forEach(function (field) { + var fieldTokens = this.pipeline.run(elasticlunr.tokenizer(doc[field])); + this.documentStore.addFieldLength(docRef, field, fieldTokens.length); + + var tokenCount = {}; + fieldTokens.forEach(function (token) { + if (token in tokenCount) tokenCount[token] += 1; + else tokenCount[token] = 1; + }, this); + + for (var token in tokenCount) { + var termFrequency = tokenCount[token]; + termFrequency = Math.sqrt(termFrequency); + this.index[field].addToken(token, { ref: docRef, tf: termFrequency }); + } + }, this); + + if (emitEvent) this.eventEmitter.emit('add', doc, this); +}; + +/** + * Removes a document from the index by doc ref. + * + * To make sure documents no longer show up in search results they can be + * removed from the index using this method. + * + * A 'remove' event is emitted with the document that has been removed and the index + * the document has been removed from. This event can be silenced by passing false + * as the second argument to remove. + * + * If user setting DocumentStore not storing the documents, then remove doc by docRef is not allowed. + * + * @param {String|Integer} docRef The document ref to remove from the index. + * @param {Boolean} emitEvent Whether to emit remove events, defaults to true + * @memberOf Index + */ +elasticlunr.Index.prototype.removeDocByRef = function (docRef, emitEvent) { + if (!docRef) return; + if (this.documentStore.isDocStored() === false) { + return; + } + + if (!this.documentStore.hasDoc(docRef)) return; + var doc = this.documentStore.getDoc(docRef); + this.removeDoc(doc, false); +}; + +/** + * Removes a document from the index. + * This remove operation could work even the original doc is not store in the DocumentStore. + * + * To make sure documents no longer show up in search results they can be + * removed from the index using this method. + * + * A 'remove' event is emitted with the document that has been removed and the index + * the document has been removed from. This event can be silenced by passing false + * as the second argument to remove. + * + * + * @param {Object} doc The document ref to remove from the index. + * @param {Boolean} emitEvent Whether to emit remove events, defaults to true + * @memberOf Index + */ +elasticlunr.Index.prototype.removeDoc = function (doc, emitEvent) { + if (!doc) return; + + var emitEvent = emitEvent === undefined ? true : emitEvent; + + var docRef = doc[this._ref]; + if (!this.documentStore.hasDoc(docRef)) return; + + this.documentStore.removeDoc(docRef); + + this._fields.forEach(function (field) { + var fieldTokens = this.pipeline.run(elasticlunr.tokenizer(doc[field])); + fieldTokens.forEach(function (token) { + this.index[field].removeToken(token, docRef); + }, this); + }, this); + + if (emitEvent) this.eventEmitter.emit('remove', doc, this); +}; + +/** + * Updates a document in the index. + * + * When a document contained within the index gets updated, fields changed, + * added or removed, to make sure it correctly matched against search queries, + * it should be updated in the index. + * + * This method is just a wrapper around `remove` and `add` + * + * An 'update' event is emitted with the document that has been updated and the index. + * This event can be silenced by passing false as the second argument to update. Only + * an update event will be fired, the 'add' and 'remove' events of the underlying calls + * are silenced. + * + * @param {Object} doc The document to update in the index. + * @param {Boolean} emitEvent Whether to emit update events, defaults to true + * @see Index.prototype.remove + * @see Index.prototype.add + * @memberOf Index + */ +elasticlunr.Index.prototype.updateDoc = function (doc, emitEvent) { + var emitEvent = emitEvent === undefined ? true : emitEvent; + + this.removeDocByRef(doc[this._ref], false); + this.addDoc(doc, false); + + if (emitEvent) this.eventEmitter.emit('update', doc, this); +}; + +/** + * Calculates the inverse document frequency for a token within the index of a field. + * + * @param {String} token The token to calculate the idf of. + * @param {String} field The field to compute idf. + * @see Index.prototype.idf + * @private + * @memberOf Index + */ +elasticlunr.Index.prototype.idf = function (term, field) { + var cacheKey = "@" + field + '/' + term; + if (Object.prototype.hasOwnProperty.call(this._idfCache, cacheKey)) return this._idfCache[cacheKey]; + + var df = this.index[field].getDocFreq(term); + var idf = 1 + Math.log(this.documentStore.length / (df + 1)); + this._idfCache[cacheKey] = idf; + + return idf; +}; + +/** + * get fields of current index instance + * + * @return {Array} + */ +elasticlunr.Index.prototype.getFields = function () { + return this._fields.slice(); +}; + +/** + * Searches the index using the passed query. + * Queries should be a string, multiple words are allowed. + * + * If config is null, will search all fields defaultly, and lead to OR based query. + * If config is specified, will search specified with query time boosting. + * + * All query tokens are passed through the same pipeline that document tokens + * are passed through, so any language processing involved will be run on every + * query term. + * + * Each query term is expanded, so that the term 'he' might be expanded to + * 'hello' and 'help' if those terms were already included in the index. + * + * Matching documents are returned as an array of objects, each object contains + * the matching document ref, as set for this index, and the similarity score + * for this document against the query. + * + * @param {String} query The query to search the index with. + * @param {JSON} userConfig The user query config, JSON format. + * @return {Object} + * @see Index.prototype.idf + * @see Index.prototype.documentVector + * @memberOf Index + */ +elasticlunr.Index.prototype.search = function (query, userConfig) { + if (!query) return []; + if (typeof query === 'string') { + query = {any: query}; + } else { + query = JSON.parse(JSON.stringify(query)); + } + + var configStr = null; + if (userConfig != null) { + configStr = JSON.stringify(userConfig); + } + + var config = new elasticlunr.Configuration(configStr, this.getFields()).get(); + + var queryTokens = {}; + var queryFields = Object.keys(query); + + for (var i = 0; i < queryFields.length; i++) { + var key = queryFields[i]; + + queryTokens[key] = this.pipeline.run(elasticlunr.tokenizer(query[key])); + } + + var queryResults = {}; + + for (var field in config) { + var tokens = queryTokens[field] || queryTokens.any; + if (!tokens) { + continue; + } + + var fieldSearchResults = this.fieldSearch(tokens, field, config); + var fieldBoost = config[field].boost; + + for (var docRef in fieldSearchResults) { + fieldSearchResults[docRef] = fieldSearchResults[docRef] * fieldBoost; + } + + for (var docRef in fieldSearchResults) { + if (docRef in queryResults) { + queryResults[docRef] += fieldSearchResults[docRef]; + } else { + queryResults[docRef] = fieldSearchResults[docRef]; + } + } + } + + var results = []; + var result; + for (var docRef in queryResults) { + result = {ref: docRef, score: queryResults[docRef]}; + if (this.documentStore.hasDoc(docRef)) { + result.doc = this.documentStore.getDoc(docRef); + } + results.push(result); + } + + results.sort(function (a, b) { return b.score - a.score; }); + return results; +}; + +/** + * search queryTokens in specified field. + * + * @param {Array} queryTokens The query tokens to query in this field. + * @param {String} field Field to query in. + * @param {elasticlunr.Configuration} config The user query config, JSON format. + * @return {Object} + */ +elasticlunr.Index.prototype.fieldSearch = function (queryTokens, fieldName, config) { + var booleanType = config[fieldName].bool; + var expand = config[fieldName].expand; + var boost = config[fieldName].boost; + var scores = null; + var docTokens = {}; + + // Do nothing if the boost is 0 + if (boost === 0) { + return; + } + + queryTokens.forEach(function (token) { + var tokens = [token]; + if (expand == true) { + tokens = this.index[fieldName].expandToken(token); + } + // Consider every query token in turn. If expanded, each query token + // corresponds to a set of tokens, which is all tokens in the + // index matching the pattern queryToken* . + // For the set of tokens corresponding to a query token, find and score + // all matching documents. Store those scores in queryTokenScores, + // keyed by docRef. + // Then, depending on the value of booleanType, combine the scores + // for this query token with previous scores. If booleanType is OR, + // then merge the scores by summing into the accumulated total, adding + // new document scores are required (effectively a union operator). + // If booleanType is AND, accumulate scores only if the document + // has previously been scored by another query token (an intersection + // operation0. + // Furthermore, since when booleanType is AND, additional + // query tokens can't add new documents to the result set, use the + // current document set to limit the processing of each new query + // token for efficiency (i.e., incremental intersection). + + var queryTokenScores = {}; + tokens.forEach(function (key) { + var docs = this.index[fieldName].getDocs(key); + var idf = this.idf(key, fieldName); + + if (scores && booleanType == 'AND') { + // special case, we can rule out documents that have been + // already been filtered out because they weren't scored + // by previous query token passes. + var filteredDocs = {}; + for (var docRef in scores) { + if (docRef in docs) { + filteredDocs[docRef] = docs[docRef]; + } + } + docs = filteredDocs; + } + // only record appeared token for retrieved documents for the + // original token, not for expaned token. + // beause for doing coordNorm for a retrieved document, coordNorm only care how many + // query token appear in that document. + // so expanded token should not be added into docTokens, if added, this will pollute the + // coordNorm + if (key == token) { + this.fieldSearchStats(docTokens, key, docs); + } + + for (var docRef in docs) { + var tf = this.index[fieldName].getTermFrequency(key, docRef); + var fieldLength = this.documentStore.getFieldLength(docRef, fieldName); + var fieldLengthNorm = 1; + if (fieldLength != 0) { + fieldLengthNorm = 1 / Math.sqrt(fieldLength); + } + + var penality = 1; + if (key != token) { + // currently I'm not sure if this penality is enough, + // need to do verification + penality = (1 - (key.length - token.length) / key.length) * 0.15; + } + + var score = tf * idf * fieldLengthNorm * penality; + + if (docRef in queryTokenScores) { + queryTokenScores[docRef] += score; + } else { + queryTokenScores[docRef] = score; + } + } + }, this); + + scores = this.mergeScores(scores, queryTokenScores, booleanType); + }, this); + + scores = this.coordNorm(scores, docTokens, queryTokens.length); + return scores; +}; + +/** + * Merge the scores from one set of tokens into an accumulated score table. + * Exact operation depends on the op parameter. If op is 'AND', then only the + * intersection of the two score lists is retained. Otherwise, the union of + * the two score lists is returned. For internal use only. + * + * @param {Object} bool accumulated scores. Should be null on first call. + * @param {String} scores new scores to merge into accumScores. + * @param {Object} op merge operation (should be 'AND' or 'OR'). + * + */ + +elasticlunr.Index.prototype.mergeScores = function (accumScores, scores, op) { + if (!accumScores) { + return scores; + } + if (op == 'AND') { + var intersection = {}; + for (var docRef in scores) { + if (docRef in accumScores) { + intersection[docRef] = accumScores[docRef] + scores[docRef]; + } + } + return intersection; + } else { + for (var docRef in scores) { + if (docRef in accumScores) { + accumScores[docRef] += scores[docRef]; + } else { + accumScores[docRef] = scores[docRef]; + } + } + return accumScores; + } +}; + + +/** + * Record the occuring query token of retrieved doc specified by doc field. + * Only for inner user. + * + * @param {Object} docTokens a data structure stores which token appears in the retrieved doc. + * @param {String} token query token + * @param {Object} docs the retrieved documents of the query token + * + */ +elasticlunr.Index.prototype.fieldSearchStats = function (docTokens, token, docs) { + for (var doc in docs) { + if (doc in docTokens) { + docTokens[doc].push(token); + } else { + docTokens[doc] = [token]; + } + } +}; + +/** + * coord norm the score of a doc. + * if a doc contain more query tokens, then the score will larger than the doc + * contains less query tokens. + * + * only for inner use. + * + * @param {Object} results first results + * @param {Object} docs field search results of a token + * @param {Integer} n query token number + * @return {Object} + */ +elasticlunr.Index.prototype.coordNorm = function (scores, docTokens, n) { + for (var doc in scores) { + if (!(doc in docTokens)) continue; + var tokens = docTokens[doc].length; + scores[doc] = scores[doc] * tokens / n; + } + + return scores; +}; + +/** + * Returns a representation of the index ready for serialisation. + * + * @return {Object} + * @memberOf Index + */ +elasticlunr.Index.prototype.toJSON = function () { + var indexJson = {}; + this._fields.forEach(function (field) { + indexJson[field] = this.index[field].toJSON(); + }, this); + + return { + version: elasticlunr.version, + fields: this._fields, + ref: this._ref, + documentStore: this.documentStore.toJSON(), + index: indexJson, + pipeline: this.pipeline.toJSON() + }; +}; + +/** + * Applies a plugin to the current index. + * + * A plugin is a function that is called with the index as its context. + * Plugins can be used to customise or extend the behaviour the index + * in some way. A plugin is just a function, that encapsulated the custom + * behaviour that should be applied to the index. + * + * The plugin function will be called with the index as its argument, additional + * arguments can also be passed when calling use. The function will be called + * with the index as its context. + * + * Example: + * + * var myPlugin = function (idx, arg1, arg2) { + * // `this` is the index to be extended + * // apply any extensions etc here. + * } + * + * var idx = elasticlunr(function () { + * this.use(myPlugin, 'arg1', 'arg2') + * }) + * + * @param {Function} plugin The plugin to apply. + * @memberOf Index + */ +elasticlunr.Index.prototype.use = function (plugin) { + var args = Array.prototype.slice.call(arguments, 1); + args.unshift(this); + plugin.apply(this, args); +}; +/*! + * elasticlunr.DocumentStore + * Copyright (C) 2017 Wei Song + */ + +/** + * elasticlunr.DocumentStore is a simple key-value document store used for storing sets of tokens for + * documents stored in index. + * + * elasticlunr.DocumentStore store original JSON format documents that you could build search snippet by this original JSON document. + * + * user could choose whether original JSON format document should be store, if no configuration then document will be stored defaultly. + * If user care more about the index size, user could select not store JSON documents, then this will has some defects, such as user + * could not use JSON document to generate snippets of search results. + * + * @param {Boolean} save If the original JSON document should be stored. + * @constructor + * @module + */ +elasticlunr.DocumentStore = function (save) { + if (save === null || save === undefined) { + this._save = true; + } else { + this._save = save; + } + + this.docs = {}; + this.docInfo = {}; + this.length = 0; +}; + +/** + * Loads a previously serialised document store + * + * @param {Object} serialisedData The serialised document store to load. + * @return {elasticlunr.DocumentStore} + */ +elasticlunr.DocumentStore.load = function (serialisedData) { + var store = new this; + + store.length = serialisedData.length; + store.docs = serialisedData.docs; + store.docInfo = serialisedData.docInfo; + store._save = serialisedData.save; + + return store; +}; + +/** + * check if current instance store the original doc + * + * @return {Boolean} + */ +elasticlunr.DocumentStore.prototype.isDocStored = function () { + return this._save; +}; + +/** + * Stores the given doc in the document store against the given id. + * If docRef already exist, then update doc. + * + * Document is store by original JSON format, then you could use original document to generate search snippets. + * + * @param {Integer|String} docRef The key used to store the JSON format doc. + * @param {Object} doc The JSON format doc. + */ +elasticlunr.DocumentStore.prototype.addDoc = function (docRef, doc) { + if (!this.hasDoc(docRef)) this.length++; + + if (this._save === true) { + this.docs[docRef] = clone(doc); + } else { + this.docs[docRef] = null; + } +}; + +/** + * Retrieves the JSON doc from the document store for a given key. + * + * If docRef not found, return null. + * If user set not storing the documents, return null. + * + * @param {Integer|String} docRef The key to lookup and retrieve from the document store. + * @return {Object} + * @memberOf DocumentStore + */ +elasticlunr.DocumentStore.prototype.getDoc = function (docRef) { + if (this.hasDoc(docRef) === false) return null; + return this.docs[docRef]; +}; + +/** + * Checks whether the document store contains a key (docRef). + * + * @param {Integer|String} docRef The id to look up in the document store. + * @return {Boolean} + * @memberOf DocumentStore + */ +elasticlunr.DocumentStore.prototype.hasDoc = function (docRef) { + return docRef in this.docs; +}; + +/** + * Removes the value for a key in the document store. + * + * @param {Integer|String} docRef The id to remove from the document store. + * @memberOf DocumentStore + */ +elasticlunr.DocumentStore.prototype.removeDoc = function (docRef) { + if (!this.hasDoc(docRef)) return; + + delete this.docs[docRef]; + delete this.docInfo[docRef]; + this.length--; +}; + +/** + * Add field length of a document's field tokens from pipeline results. + * The field length of a document is used to do field length normalization even without the original JSON document stored. + * + * @param {Integer|String} docRef document's id or reference + * @param {String} fieldName field name + * @param {Integer} length field length + */ +elasticlunr.DocumentStore.prototype.addFieldLength = function (docRef, fieldName, length) { + if (docRef === null || docRef === undefined) return; + if (this.hasDoc(docRef) == false) return; + + if (!this.docInfo[docRef]) this.docInfo[docRef] = {}; + this.docInfo[docRef][fieldName] = length; +}; + +/** + * Update field length of a document's field tokens from pipeline results. + * The field length of a document is used to do field length normalization even without the original JSON document stored. + * + * @param {Integer|String} docRef document's id or reference + * @param {String} fieldName field name + * @param {Integer} length field length + */ +elasticlunr.DocumentStore.prototype.updateFieldLength = function (docRef, fieldName, length) { + if (docRef === null || docRef === undefined) return; + if (this.hasDoc(docRef) == false) return; + + this.addFieldLength(docRef, fieldName, length); +}; + +/** + * get field length of a document by docRef + * + * @param {Integer|String} docRef document id or reference + * @param {String} fieldName field name + * @return {Integer} field length + */ +elasticlunr.DocumentStore.prototype.getFieldLength = function (docRef, fieldName) { + if (docRef === null || docRef === undefined) return 0; + + if (!(docRef in this.docs)) return 0; + if (!(fieldName in this.docInfo[docRef])) return 0; + return this.docInfo[docRef][fieldName]; +}; + +/** + * Returns a JSON representation of the document store used for serialisation. + * + * @return {Object} JSON format + * @memberOf DocumentStore + */ +elasticlunr.DocumentStore.prototype.toJSON = function () { + return { + docs: this.docs, + docInfo: this.docInfo, + length: this.length, + save: this._save + }; +}; + +/** + * Cloning object + * + * @param {Object} object in JSON format + * @return {Object} copied object + */ +function clone(obj) { + if (null === obj || "object" !== typeof obj) return obj; + + var copy = obj.constructor(); + + for (var attr in obj) { + if (obj.hasOwnProperty(attr)) copy[attr] = obj[attr]; + } + + return copy; +} +/*! + * elasticlunr.stemmer + * Copyright (C) 2017 Oliver Nightingale + * Copyright (C) 2017 Wei Song + * Includes code from - http://tartarus.org/~martin/PorterStemmer/js.txt + */ + +/** + * elasticlunr.stemmer is an english language stemmer, this is a JavaScript + * implementation of the PorterStemmer taken from http://tartarus.org/~martin + * + * @module + * @param {String} str The string to stem + * @return {String} + * @see elasticlunr.Pipeline + */ +elasticlunr.stemmer = (function(){ + var step2list = { + "ational" : "ate", + "tional" : "tion", + "enci" : "ence", + "anci" : "ance", + "izer" : "ize", + "bli" : "ble", + "alli" : "al", + "entli" : "ent", + "eli" : "e", + "ousli" : "ous", + "ization" : "ize", + "ation" : "ate", + "ator" : "ate", + "alism" : "al", + "iveness" : "ive", + "fulness" : "ful", + "ousness" : "ous", + "aliti" : "al", + "iviti" : "ive", + "biliti" : "ble", + "logi" : "log" + }, + + step3list = { + "icate" : "ic", + "ative" : "", + "alize" : "al", + "iciti" : "ic", + "ical" : "ic", + "ful" : "", + "ness" : "" + }, + + c = "[^aeiou]", // consonant + v = "[aeiouy]", // vowel + C = c + "[^aeiouy]*", // consonant sequence + V = v + "[aeiou]*", // vowel sequence + + mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0 + meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1 + mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1 + s_v = "^(" + C + ")?" + v; // vowel in stem + + var re_mgr0 = new RegExp(mgr0); + var re_mgr1 = new RegExp(mgr1); + var re_meq1 = new RegExp(meq1); + var re_s_v = new RegExp(s_v); + + var re_1a = /^(.+?)(ss|i)es$/; + var re2_1a = /^(.+?)([^s])s$/; + var re_1b = /^(.+?)eed$/; + var re2_1b = /^(.+?)(ed|ing)$/; + var re_1b_2 = /.$/; + var re2_1b_2 = /(at|bl|iz)$/; + var re3_1b_2 = new RegExp("([^aeiouylsz])\\1$"); + var re4_1b_2 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + + var re_1c = /^(.+?[^aeiou])y$/; + var re_2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + + var re_3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + + var re_4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + var re2_4 = /^(.+?)(s|t)(ion)$/; + + var re_5 = /^(.+?)e$/; + var re_5_1 = /ll$/; + var re3_5 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + + var porterStemmer = function porterStemmer(w) { + var stem, + suffix, + firstch, + re, + re2, + re3, + re4; + + if (w.length < 3) { return w; } + + firstch = w.substr(0,1); + if (firstch == "y") { + w = firstch.toUpperCase() + w.substr(1); + } + + // Step 1a + re = re_1a + re2 = re2_1a; + + if (re.test(w)) { w = w.replace(re,"$1$2"); } + else if (re2.test(w)) { w = w.replace(re2,"$1$2"); } + + // Step 1b + re = re_1b; + re2 = re2_1b; + if (re.test(w)) { + var fp = re.exec(w); + re = re_mgr0; + if (re.test(fp[1])) { + re = re_1b_2; + w = w.replace(re,""); + } + } else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = re_s_v; + if (re2.test(stem)) { + w = stem; + re2 = re2_1b_2; + re3 = re3_1b_2; + re4 = re4_1b_2; + if (re2.test(w)) { w = w + "e"; } + else if (re3.test(w)) { re = re_1b_2; w = w.replace(re,""); } + else if (re4.test(w)) { w = w + "e"; } + } + } + + // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first letter of the word (so cry -> cri, by -> by, say -> say) + re = re_1c; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + w = stem + "i"; + } + + // Step 2 + re = re_2; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = re_mgr0; + if (re.test(stem)) { + w = stem + step2list[suffix]; + } + } + + // Step 3 + re = re_3; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = re_mgr0; + if (re.test(stem)) { + w = stem + step3list[suffix]; + } + } + + // Step 4 + re = re_4; + re2 = re2_4; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = re_mgr1; + if (re.test(stem)) { + w = stem; + } + } else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = re_mgr1; + if (re2.test(stem)) { + w = stem; + } + } + + // Step 5 + re = re_5; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = re_mgr1; + re2 = re_meq1; + re3 = re3_5; + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) { + w = stem; + } + } + + re = re_5_1; + re2 = re_mgr1; + if (re.test(w) && re2.test(w)) { + re = re_1b_2; + w = w.replace(re,""); + } + + // and turn initial Y back to y + + if (firstch == "y") { + w = firstch.toLowerCase() + w.substr(1); + } + + return w; + }; + + return porterStemmer; +})(); + +elasticlunr.Pipeline.registerFunction(elasticlunr.stemmer, 'stemmer'); +/*! + * elasticlunr.stopWordFilter + * Copyright (C) 2017 Oliver Nightingale + * Copyright (C) 2017 Wei Song + */ + +/** + * elasticlunr.stopWordFilter is an English language stop words filter, any words + * contained in the stop word list will not be passed through the filter. + * + * This is intended to be used in the Pipeline. If the token does not pass the + * filter then undefined will be returned. + * Currently this StopwordFilter using dictionary to do O(1) time complexity stop word filtering. + * + * @module + * @param {String} token The token to pass through the filter + * @return {String} + * @see elasticlunr.Pipeline + */ +elasticlunr.stopWordFilter = function (token) { + if (token && elasticlunr.stopWordFilter.stopWords[token] !== true) { + return token; + } +}; + +/** + * Remove predefined stop words + * if user want to use customized stop words, user could use this function to delete + * all predefined stopwords. + * + * @return {null} + */ +elasticlunr.clearStopWords = function () { + elasticlunr.stopWordFilter.stopWords = {}; +}; + +/** + * Add customized stop words + * user could use this function to add customized stop words + * + * @params {Array} words customized stop words + * @return {null} + */ +elasticlunr.addStopWords = function (words) { + if (words == null || Array.isArray(words) === false) return; + + words.forEach(function (word) { + elasticlunr.stopWordFilter.stopWords[word] = true; + }, this); +}; + +/** + * Reset to default stop words + * user could use this function to restore default stop words + * + * @return {null} + */ +elasticlunr.resetStopWords = function () { + elasticlunr.stopWordFilter.stopWords = elasticlunr.defaultStopWords; +}; + +elasticlunr.defaultStopWords = { + "": true, + "a": true, + "able": true, + "about": true, + "across": true, + "after": true, + "all": true, + "almost": true, + "also": true, + "am": true, + "among": true, + "an": true, + "and": true, + "any": true, + "are": true, + "as": true, + "at": true, + "be": true, + "because": true, + "been": true, + "but": true, + "by": true, + "can": true, + "cannot": true, + "could": true, + "dear": true, + "did": true, + "do": true, + "does": true, + "either": true, + "else": true, + "ever": true, + "every": true, + "for": true, + "from": true, + "get": true, + "got": true, + "had": true, + "has": true, + "have": true, + "he": true, + "her": true, + "hers": true, + "him": true, + "his": true, + "how": true, + "however": true, + "i": true, + "if": true, + "in": true, + "into": true, + "is": true, + "it": true, + "its": true, + "just": true, + "least": true, + "let": true, + "like": true, + "likely": true, + "may": true, + "me": true, + "might": true, + "most": true, + "must": true, + "my": true, + "neither": true, + "no": true, + "nor": true, + "not": true, + "of": true, + "off": true, + "often": true, + "on": true, + "only": true, + "or": true, + "other": true, + "our": true, + "own": true, + "rather": true, + "said": true, + "say": true, + "says": true, + "she": true, + "should": true, + "since": true, + "so": true, + "some": true, + "than": true, + "that": true, + "the": true, + "their": true, + "them": true, + "then": true, + "there": true, + "these": true, + "they": true, + "this": true, + "tis": true, + "to": true, + "too": true, + "twas": true, + "us": true, + "wants": true, + "was": true, + "we": true, + "were": true, + "what": true, + "when": true, + "where": true, + "which": true, + "while": true, + "who": true, + "whom": true, + "why": true, + "will": true, + "with": true, + "would": true, + "yet": true, + "you": true, + "your": true +}; + +elasticlunr.stopWordFilter.stopWords = elasticlunr.defaultStopWords; + +elasticlunr.Pipeline.registerFunction(elasticlunr.stopWordFilter, 'stopWordFilter'); +/*! + * elasticlunr.trimmer + * Copyright (C) 2017 Oliver Nightingale + * Copyright (C) 2017 Wei Song + */ + +/** + * elasticlunr.trimmer is a pipeline function for trimming non word + * characters from the begining and end of tokens before they + * enter the index. + * + * This implementation may not work correctly for non latin + * characters and should either be removed or adapted for use + * with languages with non-latin characters. + * + * @module + * @param {String} token The token to pass through the filter + * @return {String} + * @see elasticlunr.Pipeline + */ +elasticlunr.trimmer = function (token) { + if (token === null || token === undefined) { + throw new Error('token should not be undefined'); + } + + return token + .replace(/^\W+/, '') + .replace(/\W+$/, ''); +}; + +elasticlunr.Pipeline.registerFunction(elasticlunr.trimmer, 'trimmer'); +/*! + * elasticlunr.InvertedIndex + * Copyright (C) 2017 Wei Song + * Includes code from - http://tartarus.org/~martin/PorterStemmer/js.txt + */ + +/** + * elasticlunr.InvertedIndex is used for efficiently storing and + * lookup of documents that contain a given token. + * + * @constructor + */ +elasticlunr.InvertedIndex = function () { + this.root = { docs: {}, df: 0 }; +}; + +/** + * Loads a previously serialised inverted index. + * + * @param {Object} serialisedData The serialised inverted index to load. + * @return {elasticlunr.InvertedIndex} + */ +elasticlunr.InvertedIndex.load = function (serialisedData) { + var idx = new this; + idx.root = serialisedData.root; + + return idx; +}; + +/** + * Adds a {token: tokenInfo} pair to the inverted index. + * If the token already exist, then update the tokenInfo. + * + * tokenInfo format: { ref: 1, tf: 2} + * tokenInfor should contains the document's ref and the tf(token frequency) of that token in + * the document. + * + * By default this function starts at the root of the current inverted index, however + * it can start at any node of the inverted index if required. + * + * @param {String} token + * @param {Object} tokenInfo format: { ref: 1, tf: 2} + * @param {Object} root An optional node at which to start looking for the + * correct place to enter the doc, by default the root of this elasticlunr.InvertedIndex + * is used. + * @memberOf InvertedIndex + */ +elasticlunr.InvertedIndex.prototype.addToken = function (token, tokenInfo, root) { + var root = root || this.root, + idx = 0; + + while (idx <= token.length - 1) { + var key = token[idx]; + + if (!(key in root)) root[key] = {docs: {}, df: 0}; + idx += 1; + root = root[key]; + } + + var docRef = tokenInfo.ref; + if (!root.docs[docRef]) { + // if this doc not exist, then add this doc + root.docs[docRef] = {tf: tokenInfo.tf}; + root.df += 1; + } else { + // if this doc already exist, then update tokenInfo + root.docs[docRef] = {tf: tokenInfo.tf}; + } +}; + +/** + * Checks whether a token is in this elasticlunr.InvertedIndex. + * + * + * @param {String} token The token to be checked + * @return {Boolean} + * @memberOf InvertedIndex + */ +elasticlunr.InvertedIndex.prototype.hasToken = function (token) { + if (!token) return false; + + var node = this.root; + + for (var i = 0; i < token.length; i++) { + if (!node[token[i]]) return false; + node = node[token[i]]; + } + + return true; +}; + +/** + * Retrieve a node from the inverted index for a given token. + * If token not found in this InvertedIndex, return null. + * + * + * @param {String} token The token to get the node for. + * @return {Object} + * @see InvertedIndex.prototype.get + * @memberOf InvertedIndex + */ +elasticlunr.InvertedIndex.prototype.getNode = function (token) { + if (!token) return null; + + var node = this.root; + + for (var i = 0; i < token.length; i++) { + if (!node[token[i]]) return null; + node = node[token[i]]; + } + + return node; +}; + +/** + * Retrieve the documents of a given token. + * If token not found, return {}. + * + * + * @param {String} token The token to get the documents for. + * @return {Object} + * @memberOf InvertedIndex + */ +elasticlunr.InvertedIndex.prototype.getDocs = function (token) { + var node = this.getNode(token); + if (node == null) { + return {}; + } + + return node.docs; +}; + +/** + * Retrieve term frequency of given token in given docRef. + * If token or docRef not found, return 0. + * + * + * @param {String} token The token to get the documents for. + * @param {String|Integer} docRef + * @return {Integer} + * @memberOf InvertedIndex + */ +elasticlunr.InvertedIndex.prototype.getTermFrequency = function (token, docRef) { + var node = this.getNode(token); + + if (node == null) { + return 0; + } + + if (!(docRef in node.docs)) { + return 0; + } + + return node.docs[docRef].tf; +}; + +/** + * Retrieve the document frequency of given token. + * If token not found, return 0. + * + * + * @param {String} token The token to get the documents for. + * @return {Object} + * @memberOf InvertedIndex + */ +elasticlunr.InvertedIndex.prototype.getDocFreq = function (token) { + var node = this.getNode(token); + + if (node == null) { + return 0; + } + + return node.df; +}; + +/** + * Remove the document identified by document's ref from the token in the inverted index. + * + * + * @param {String} token Remove the document from which token. + * @param {String} ref The ref of the document to remove from given token. + * @memberOf InvertedIndex + */ +elasticlunr.InvertedIndex.prototype.removeToken = function (token, ref) { + if (!token) return; + var node = this.getNode(token); + + if (node == null) return; + + if (ref in node.docs) { + delete node.docs[ref]; + node.df -= 1; + } +}; + +/** + * Find all the possible suffixes of given token using tokens currently in the inverted index. + * If token not found, return empty Array. + * + * @param {String} token The token to expand. + * @return {Array} + * @memberOf InvertedIndex + */ +elasticlunr.InvertedIndex.prototype.expandToken = function (token, memo, root) { + if (token == null || token == '') return []; + var memo = memo || []; + + if (root == void 0) { + root = this.getNode(token); + if (root == null) return memo; + } + + if (root.df > 0) memo.push(token); + + for (var key in root) { + if (key === 'docs') continue; + if (key === 'df') continue; + this.expandToken(token + key, memo, root[key]); + } + + return memo; +}; + +/** + * Returns a representation of the inverted index ready for serialisation. + * + * @return {Object} + * @memberOf InvertedIndex + */ +elasticlunr.InvertedIndex.prototype.toJSON = function () { + return { + root: this.root + }; +}; + +/*! + * elasticlunr.Configuration + * Copyright (C) 2017 Wei Song + */ + + /** + * elasticlunr.Configuration is used to analyze the user search configuration. + * + * By elasticlunr.Configuration user could set query-time boosting, boolean model in each field. + * + * Currently configuration supports: + * 1. query-time boosting, user could set how to boost each field. + * 2. boolean model chosing, user could choose which boolean model to use for each field. + * 3. token expandation, user could set token expand to True to improve Recall. Default is False. + * + * Query time boosting must be configured by field category, "boolean" model could be configured + * by both field category or globally as the following example. Field configuration for "boolean" + * will overwrite global configuration. + * Token expand could be configured both by field category or golbally. Local field configuration will + * overwrite global configuration. + * + * configuration example: + * { + * fields:{ + * title: {boost: 2}, + * body: {boost: 1} + * }, + * bool: "OR" + * } + * + * "bool" field configuation overwrite global configuation example: + * { + * fields:{ + * title: {boost: 2, bool: "AND"}, + * body: {boost: 1} + * }, + * bool: "OR" + * } + * + * "expand" example: + * { + * fields:{ + * title: {boost: 2, bool: "AND"}, + * body: {boost: 1} + * }, + * bool: "OR", + * expand: true + * } + * + * "expand" example for field category: + * { + * fields:{ + * title: {boost: 2, bool: "AND", expand: true}, + * body: {boost: 1} + * }, + * bool: "OR" + * } + * + * setting the boost to 0 ignores the field (this will only search the title): + * { + * fields:{ + * title: {boost: 1}, + * body: {boost: 0} + * } + * } + * + * then, user could search with configuration to do query-time boosting. + * idx.search('oracle database', {fields: {title: {boost: 2}, body: {boost: 1}}}); + * + * + * @constructor + * + * @param {String} config user configuration + * @param {Array} fields fields of index instance + * @module + */ +elasticlunr.Configuration = function (config, fields) { + var config = config || ''; + + if (fields == undefined || fields == null) { + throw new Error('fields should not be null'); + } + + this.config = {}; + + var userConfig; + try { + userConfig = JSON.parse(config); + this.buildUserConfig(userConfig, fields); + } catch (error) { + elasticlunr.utils.warn('user configuration parse failed, will use default configuration'); + this.buildDefaultConfig(fields); + } +}; + +/** + * Build default search configuration. + * + * @param {Array} fields fields of index instance + */ +elasticlunr.Configuration.prototype.buildDefaultConfig = function (fields) { + this.reset(); + fields.forEach(function (field) { + this.config[field] = { + boost: 1, + bool: "OR", + expand: false + }; + }, this); +}; + +/** + * Build user configuration. + * + * @param {JSON} config User JSON configuratoin + * @param {Array} fields fields of index instance + */ +elasticlunr.Configuration.prototype.buildUserConfig = function (config, fields) { + var global_bool = "OR"; + var global_expand = false; + + this.reset(); + if ('bool' in config) { + global_bool = config['bool'] || global_bool; + } + + if ('expand' in config) { + global_expand = config['expand'] || global_expand; + } + + if ('fields' in config) { + for (var field in config['fields']) { + if (fields.indexOf(field) > -1) { + var field_config = config['fields'][field]; + var field_expand = global_expand; + if (field_config.expand != undefined) { + field_expand = field_config.expand; + } + + this.config[field] = { + boost: (field_config.boost || field_config.boost === 0) ? field_config.boost : 1, + bool: field_config.bool || global_bool, + expand: field_expand + }; + } else { + elasticlunr.utils.warn('field name in user configuration not found in index instance fields'); + } + } + } else { + this.addAllFields2UserConfig(global_bool, global_expand, fields); + } +}; + +/** + * Add all fields to user search configuration. + * + * @param {String} bool Boolean model + * @param {String} expand Expand model + * @param {Array} fields fields of index instance + */ +elasticlunr.Configuration.prototype.addAllFields2UserConfig = function (bool, expand, fields) { + fields.forEach(function (field) { + this.config[field] = { + boost: 1, + bool: bool, + expand: expand + }; + }, this); +}; + +/** + * get current user configuration + */ +elasticlunr.Configuration.prototype.get = function () { + return this.config; +}; + +/** + * reset user search configuration. + */ +elasticlunr.Configuration.prototype.reset = function () { + this.config = {}; +}; +/** + * sorted_set.js is added only to make elasticlunr.js compatible with lunr-languages. + * if elasticlunr.js support different languages by default, this will make elasticlunr.js + * much bigger that not good for browser usage. + * + */ + + +/*! + * lunr.SortedSet + * Copyright (C) 2017 Oliver Nightingale + */ + +/** + * lunr.SortedSets are used to maintain an array of uniq values in a sorted + * order. + * + * @constructor + */ +lunr.SortedSet = function () { + this.length = 0 + this.elements = [] +} + +/** + * Loads a previously serialised sorted set. + * + * @param {Array} serialisedData The serialised set to load. + * @returns {lunr.SortedSet} + * @memberOf SortedSet + */ +lunr.SortedSet.load = function (serialisedData) { + var set = new this + + set.elements = serialisedData + set.length = serialisedData.length + + return set +} + +/** + * Inserts new items into the set in the correct position to maintain the + * order. + * + * @param {Object} The objects to add to this set. + * @memberOf SortedSet + */ +lunr.SortedSet.prototype.add = function () { + var i, element + + for (i = 0; i < arguments.length; i++) { + element = arguments[i] + if (~this.indexOf(element)) continue + this.elements.splice(this.locationFor(element), 0, element) + } + + this.length = this.elements.length +} + +/** + * Converts this sorted set into an array. + * + * @returns {Array} + * @memberOf SortedSet + */ +lunr.SortedSet.prototype.toArray = function () { + return this.elements.slice() +} + +/** + * Creates a new array with the results of calling a provided function on every + * element in this sorted set. + * + * Delegates to Array.prototype.map and has the same signature. + * + * @param {Function} fn The function that is called on each element of the + * set. + * @param {Object} ctx An optional object that can be used as the context + * for the function fn. + * @returns {Array} + * @memberOf SortedSet + */ +lunr.SortedSet.prototype.map = function (fn, ctx) { + return this.elements.map(fn, ctx) +} + +/** + * Executes a provided function once per sorted set element. + * + * Delegates to Array.prototype.forEach and has the same signature. + * + * @param {Function} fn The function that is called on each element of the + * set. + * @param {Object} ctx An optional object that can be used as the context + * @memberOf SortedSet + * for the function fn. + */ +lunr.SortedSet.prototype.forEach = function (fn, ctx) { + return this.elements.forEach(fn, ctx) +} + +/** + * Returns the index at which a given element can be found in the + * sorted set, or -1 if it is not present. + * + * @param {Object} elem The object to locate in the sorted set. + * @returns {Number} + * @memberOf SortedSet + */ +lunr.SortedSet.prototype.indexOf = function (elem) { + var start = 0, + end = this.elements.length, + sectionLength = end - start, + pivot = start + Math.floor(sectionLength / 2), + pivotElem = this.elements[pivot] + + while (sectionLength > 1) { + if (pivotElem === elem) return pivot + + if (pivotElem < elem) start = pivot + if (pivotElem > elem) end = pivot + + sectionLength = end - start + pivot = start + Math.floor(sectionLength / 2) + pivotElem = this.elements[pivot] + } + + if (pivotElem === elem) return pivot + + return -1 +} + +/** + * Returns the position within the sorted set that an element should be + * inserted at to maintain the current order of the set. + * + * This function assumes that the element to search for does not already exist + * in the sorted set. + * + * @param {Object} elem The elem to find the position for in the set + * @returns {Number} + * @memberOf SortedSet + */ +lunr.SortedSet.prototype.locationFor = function (elem) { + var start = 0, + end = this.elements.length, + sectionLength = end - start, + pivot = start + Math.floor(sectionLength / 2), + pivotElem = this.elements[pivot] + + while (sectionLength > 1) { + if (pivotElem < elem) start = pivot + if (pivotElem > elem) end = pivot + + sectionLength = end - start + pivot = start + Math.floor(sectionLength / 2) + pivotElem = this.elements[pivot] + } + + if (pivotElem > elem) return pivot + if (pivotElem < elem) return pivot + 1 +} + +/** + * Creates a new lunr.SortedSet that contains the elements in the intersection + * of this set and the passed set. + * + * @param {lunr.SortedSet} otherSet The set to intersect with this set. + * @returns {lunr.SortedSet} + * @memberOf SortedSet + */ +lunr.SortedSet.prototype.intersect = function (otherSet) { + var intersectSet = new lunr.SortedSet, + i = 0, j = 0, + a_len = this.length, b_len = otherSet.length, + a = this.elements, b = otherSet.elements + + while (true) { + if (i > a_len - 1 || j > b_len - 1) break + + if (a[i] === b[j]) { + intersectSet.add(a[i]) + i++, j++ + continue + } + + if (a[i] < b[j]) { + i++ + continue + } + + if (a[i] > b[j]) { + j++ + continue + } + }; + + return intersectSet +} + +/** + * Makes a copy of this set + * + * @returns {lunr.SortedSet} + * @memberOf SortedSet + */ +lunr.SortedSet.prototype.clone = function () { + var clone = new lunr.SortedSet + + clone.elements = this.toArray() + clone.length = clone.elements.length + + return clone +} + +/** + * Creates a new lunr.SortedSet that contains the elements in the union + * of this set and the passed set. + * + * @param {lunr.SortedSet} otherSet The set to union with this set. + * @returns {lunr.SortedSet} + * @memberOf SortedSet + */ +lunr.SortedSet.prototype.union = function (otherSet) { + var longSet, shortSet, unionSet + + if (this.length >= otherSet.length) { + longSet = this, shortSet = otherSet + } else { + longSet = otherSet, shortSet = this + } + + unionSet = longSet.clone() + + for(var i = 0, shortSetElements = shortSet.toArray(); i < shortSetElements.length; i++){ + unionSet.add(shortSetElements[i]) + } + + return unionSet +} + +/** + * Returns a representation of the sorted set ready for serialisation. + * + * @returns {Array} + * @memberOf SortedSet + */ +lunr.SortedSet.prototype.toJSON = function () { + return this.toArray() +} + /** + * export the module via AMD, CommonJS or as a browser global + * Export code from https://github.com/umdjs/umd/blob/master/returnExports.js + */ + ;(function (root, factory) { + if (typeof define === 'function' && define.amd) { + // AMD. Register as an anonymous module. + define(factory) + } else if (typeof exports === 'object') { + /** + * Node. Does not work with strict CommonJS, but + * only CommonJS-like enviroments that support module.exports, + * like Node. + */ + module.exports = factory() + } else { + // Browser globals (root is window) + root.elasticlunr = factory() + } + }(this, function () { + /** + * Just return a value to define the module export. + * This example returns an object, but the module + * can return a function as the exported value. + */ + return elasticlunr + })) +})(); diff --git a/nasg.py b/nasg.py index e294562..6890383 100644 --- a/nasg.py +++ b/nasg.py @@ -14,6 +14,7 @@ import re import imghdr import logging import asyncio +import json from shutil import copy2 as cp from math import ceil from urllib.parse import urlparse @@ -27,10 +28,10 @@ import markdown from feedgen.feed import FeedGenerator from bleach import clean from emoji import UNICODE_EMOJI +from py_mini_racer import py_mini_racer import exiftool import settings -import sys from pprint import pprint MarkdownImage = namedtuple( @@ -65,7 +66,6 @@ MD = markdown.Markdown( ], ) - class MarkdownDoc(object): @property @cached() @@ -443,6 +443,18 @@ class Singular(MarkdownDoc): else: return True + @property + def corpus(self): + return { + 'url': self.url, + 'title': self.title, + 'body': "\n".join([ + self.name, + self.summary, + self.content, + ]) + } + async def render(self): if self.exists: return @@ -1025,6 +1037,40 @@ class Category(dict): self.ping_websub() +class Search(object): + def __init__(self): + self.js = py_mini_racer.MiniRacer() + with open('elasticlunr.js') as f: + self.js.eval(f.read()) + + self.js.eval(""" + var index = elasticlunr(); + index.addField('title'); + index.addField('body'); + index.setRef('url'); + + """) + # index.saveDocument(false); + + @property + def fpath(self): + return os.path.join( + settings.paths.get('build'), + 'search.json' + ) + + def add(self, data): + self.js.eval(""" + index.addDoc(%s); + """ % ( + json.dumps(data) + )) + + def save(self): + with open(self.fpath, 'wt') as f: + f.write(json.dumps(self.js.eval("index.toJSON()"))) + + def make(): start = int(round(time.time() * 1000)) content = settings.paths.get('content') @@ -1044,6 +1090,8 @@ def make(): categories = {} categories['/'] = Category() sitemap = OrderedDict() + search = Search() + for e in sorted(glob.glob(os.path.join(content, '*', '*', 'index.md'))): post = Singular(e) if post.category not in categories: @@ -1056,6 +1104,7 @@ def make(): for i in post.images.values(): worker.append(i.downsize()) worker.append(post.render()) + search.add(post.corpus) sitemap[post.url] = post.mtime for category in categories.values(): @@ -1079,6 +1128,9 @@ def make(): with open(t, 'wt') as f: f.write("\n".join(sorted(sitemap.keys()))) + # dump search index + search.save() + end = int(round(time.time() * 1000)) logging.info('process took %d ms' % (end - start))