diff --git a/plug-api/lib/query.ts b/plug-api/lib/query.ts index 0110c1f7..34b92a13 100644 --- a/plug-api/lib/query.ts +++ b/plug-api/lib/query.ts @@ -215,6 +215,20 @@ export function applyQueryNoFilterKV( allItems[i].value = newRec; } } + if (query.distinct) { + // Remove duplicates + const valueSet = new Set(); + const uniqueItems: KV[] = []; + for (const item of allItems) { + const value = JSON.stringify(item.value); + if (!valueSet.has(value)) { + valueSet.add(value); + uniqueItems.push(item); + } + } + allItems = uniqueItems; + } + if (query.limit) { const limit = evalQueryExpression(query.limit, {}, functionMap); if (allItems.length > limit) { diff --git a/plug-api/types.ts b/plug-api/types.ts index 6f30c6ef..b24044e5 100644 --- a/plug-api/types.ts +++ b/plug-api/types.ts @@ -66,6 +66,7 @@ export type Query = { select?: Select[]; limit?: QueryExpression; render?: string; + distinct?: boolean; }; export type KvQuery = Omit & { diff --git a/plugs/index/api.ts b/plugs/index/api.ts index 744f6c98..07a7932e 100644 --- a/plugs/index/api.ts +++ b/plugs/index/api.ts @@ -1,5 +1,5 @@ import { datastore } from "$sb/syscalls.ts"; -import { KV, KvKey, ObjectQuery, ObjectValue } from "$sb/types.ts"; +import { KV, KvKey, KvQuery, ObjectQuery, ObjectValue } from "$sb/types.ts"; import { QueryProviderEvent } from "$sb/app_event.ts"; import { builtins } from "./builtins.ts"; import { AttributeObject, determineType } from "./attributes.ts"; @@ -126,16 +126,25 @@ export async function queryObjects( return (await datastore.query({ ...query, prefix: [indexKey, tag], + distinct: true, })).map(({ value }) => value); } -export async function getObjectByRef( +export async function query( + query: KvQuery, +): Promise { + return (await datastore.query({ + ...query, + prefix: [indexKey, ...query.prefix ? query.prefix : []], + })).map(({ key, value }) => ({ key: key.slice(1), value })); +} + +export function getObjectByRef( page: string, tag: string, ref: string, ): Promise | undefined> { - console.log("Fetching!!!!!", [indexKey, tag, cleanKey(ref, page), page]); - return (await datastore.get([indexKey, tag, cleanKey(ref, page), page])); + return datastore.get([indexKey, tag, cleanKey(ref, page), page]); } export async function objectSourceProvider({ @@ -145,6 +154,7 @@ export async function objectSourceProvider({ const results = await datastore.query({ ...query, prefix: [indexKey, tag], + distinct: true, }); return results.map((r) => r.value); } diff --git a/plugs/index/command.ts b/plugs/index/command.ts index 3dd45245..a5f73250 100644 --- a/plugs/index/command.ts +++ b/plugs/index/command.ts @@ -12,7 +12,6 @@ export async function reindexCommand() { export async function reindexSpace() { console.log("Clearing page index..."); // Executed this way to not have to embed the search plug code here - await system.invokeFunction("search.clearIndex"); await system.invokeFunction("index.clearIndex"); const pages = await space.listPages(); diff --git a/plugs/index/index.plug.yaml b/plugs/index/index.plug.yaml index 4b221d64..5e7b3352 100644 --- a/plugs/index/index.plug.yaml +++ b/plugs/index/index.plug.yaml @@ -23,6 +23,9 @@ functions: batchSet: path: api.ts:batchSet env: server + query: + path: api.ts:query + env: server indexObjects: path: api.ts:indexObjects env: server diff --git a/plugs/index/plug_api.ts b/plugs/index/plug_api.ts index 4789d761..27f6d2cd 100644 --- a/plugs/index/plug_api.ts +++ b/plugs/index/plug_api.ts @@ -1,4 +1,4 @@ -import { ObjectQuery, ObjectValue } from "$sb/types.ts"; +import { KV, KvQuery, ObjectQuery, ObjectValue } from "$sb/types.ts"; import { invokeFunction } from "$sb/silverbullet-syscall/system.ts"; export function indexObjects( @@ -8,6 +8,16 @@ export function indexObjects( return invokeFunction("index.indexObjects", page, objects); } +export function batchSet(page: string, kvs: KV[]): Promise { + return invokeFunction("index.batchSet", page, kvs); +} + +export function query( + query: KvQuery, +): Promise { + return invokeFunction("index.query", query); +} + export function queryObjects( tag: string, query: ObjectQuery, diff --git a/plugs/search/engine.test.ts b/plugs/search/engine.test.ts deleted file mode 100644 index 5c494bd1..00000000 --- a/plugs/search/engine.test.ts +++ /dev/null @@ -1,72 +0,0 @@ -import { KV, KvKey } from "$sb/types.ts"; -import { assertEquals } from "../../test_deps.ts"; -import { BatchKVStore, SimpleSearchEngine } from "./engine.ts"; - -class InMemoryBatchKVStore implements BatchKVStore { - private store = new Map(); - - query({ prefix }: { prefix: KvKey }): Promise { - const results: KV[] = []; - entries: - for (const [key, value] of this.store.entries()) { - const parsedKey: string[] = JSON.parse(key); - for (let i = 0; i < prefix.length; i++) { - if (prefix[i] !== parsedKey[i]) { - continue entries; - } - } - results.push({ key: parsedKey, value }); - } - return Promise.resolve(results); - } - - batchSet(kvs: KV[]): Promise { - for (const { key, value } of kvs) { - this.store.set(JSON.stringify(key), value); - } - return Promise.resolve(); - } - - batchDel(keys: KvKey[]): Promise { - for (const key of keys) { - this.store.delete(JSON.stringify(key)); - } - return Promise.resolve(); - } -} - -Deno.test("Test full text search", async () => { - const engine = new SimpleSearchEngine(new InMemoryBatchKVStore()); - - await engine.indexDocument({ id: "1", text: "The quick brown fox" }); - await engine.indexDocument({ id: "2", text: "jumps over the lazy dogs" }); - await engine.indexDocument({ - id: "3", - text: "Hello world, jumping jump jumps", - }); - await engine.indexDocument({ id: "4", text: "TypeScript is awesome" }); - await engine.indexDocument({ id: "5", text: "The brown dogs jumps zęf" }); - - console.log(engine.index); - - const results = await engine.search("Brown fox"); - console.log(results); - assertEquals(results.length, 2); - assertEquals(results[0].id, "1"); - assertEquals(results[0].score, 2); - assertEquals(results[1].id, "5"); - assertEquals(results[1].score, 1); - - const results2 = await engine.search("jump"); - console.log(results2); - assertEquals(results2.length, 3); - - await engine.deleteDocument("3"); - const results3 = await engine.search("jump"); - console.log(results3); - assertEquals(results3.length, 2); - - const results4 = await engine.search("zęf"); - console.log(results4); - assertEquals(results4.length, 1); -}); diff --git a/plugs/search/engine.ts b/plugs/search/engine.ts index 5c294bc4..03faddf2 100644 --- a/plugs/search/engine.ts +++ b/plugs/search/engine.ts @@ -1,124 +1,88 @@ import { stemmer } from "https://esm.sh/porter-stemmer@0.9.1"; -import { KV, KvKey } from "$sb/types.ts"; - -export type Document = { - id: string; - text: string; -}; - -export interface BatchKVStore { - batchSet(kvs: KV[]): Promise; - batchDel(keys: KvKey[]): Promise; - query(options: { prefix: KvKey }): Promise; -} +import { batchSet, query } from "../index/plug_api.ts"; type ResultObject = { score: number; id: string; }; -export class SimpleSearchEngine { - private stopWords = ["and", "or", "the", "a", "an"]; +const stopWords = ["and", "or", "the", "a", "an"]; - constructor( - public index: BatchKVStore, - // public reverseIndex: BatchKVStore, - ) { +// Tokenize text into words +function tokenize(text: string): string[] { + return text.toLowerCase().split(/[^\p{L}]+/u); +} + +// Remove stop words from array of words +function removeStopWords(words: string[]): string[] { + return words.filter((word) => + word.length > 2 && + !stopWords.includes(word) && /^\p{L}+$/u.test(word) + ); +} + +// Basic stemming function +function stem(word: string): string { + return stemmer(word); +} + +// Index an array of documents +export async function ftsIndexPage( + pageName: string, + text: string, +): Promise { + const updateIndexMap = new Map(); // word!id -> count + + const pageNameTokens = tokenize(pageName); + const pageContentTokens = tokenize(text); + const words = [...pageNameTokens, ...pageContentTokens]; + const filteredWords = removeStopWords(words); + const stemmedWords = filteredWords.map(stem); + + // Get the current IDs for these stemmed words + // const uniqueStemmedWords = [...new Set(stemmedWords)]; + + for (const stemmedWord of stemmedWords) { + const currentFreq = updateIndexMap.get(stemmedWord) || 0; + updateIndexMap.set(stemmedWord, currentFreq + 1); } - // Tokenize text into words - private tokenize(text: string): string[] { - return text.toLowerCase().split(/[^\p{L}]+/u); - } + // console.log("updateIndexMap", updateIndexMap); - // Remove stop words from array of words - private removeStopWords(words: string[]): string[] { - return words.filter((word) => - word.length > 2 && - !this.stopWords.includes(word) && /^\p{L}+$/u.test(word) - ); - } + await batchSet( + pageName, + [...updateIndexMap.entries()].map(( + [key, value], + ) => ({ key: ["fts", key], value })), + ); +} - // Basic stemming function - private stem(word: string): string { - return stemmer(word); - } +// Search for a phrase and return document ids sorted by match count +export async function ftsSearch(phrase: string): Promise { + const words = tokenize(phrase); + const filteredWords = removeStopWords(words); + const stemmedWords = filteredWords.map((word) => stem(word)); - // Index an array of documents - public async indexDocument(document: Document): Promise { - const updateIndexMap = new Map(); // word!id -> count - const updateReverseIndexMap = new Map(); // id!word -> true + // const wordIdsArray: string[][] = await this.index.get(stemmedWords); + const matchCounts: Map = new Map(); // pageName -> count - const pageContent = this.tokenize(document.text); - const pageName = this.tokenize(document.id); - const words = [...pageContent, ...pageName]; - const filteredWords = this.removeStopWords(words); - const stemmedWords = filteredWords.map((word) => this.stem(word)); - - // Get the current IDs for these stemmed words - // const uniqueStemmedWords = [...new Set(stemmedWords)]; - - for (const stemmedWord of stemmedWords) { - const key = `${stemmedWord}!${document.id}`; - const revKey = `${document.id}!${stemmedWord}`; - const currentFreq = updateIndexMap.get(key) || 0; - updateIndexMap.set(key, currentFreq + 1); - updateReverseIndexMap.set(revKey, true); - } - - // console.log("updateIndexMap", updateIndexMap); - - await this.index.batchSet( - [...updateIndexMap.entries()].map(( - [key, value], - ) => ({ key: ["fts", ...key.split("!")], value: value })), - ); - await this.index.batchSet( - [...updateReverseIndexMap.entries()].map(( - [key, value], - ) => ({ key: ["fts_rev", ...key.split("!")], value: value })), - ); - } - - // Search for a phrase and return document ids sorted by match count - public async search(phrase: string): Promise { - const words = this.tokenize(phrase); - const filteredWords = this.removeStopWords(words); - const stemmedWords = filteredWords.map((word) => this.stem(word)); - - // const wordIdsArray: string[][] = await this.index.get(stemmedWords); - const matchCounts: Map = new Map(); // pageName -> count - - for (const stemmedWord of stemmedWords) { - const entries = await this.index.query({ prefix: ["fts", stemmedWord] }); - for (const { key, value } of entries) { - const id = key[2]; - if (matchCounts.has(id)) { - matchCounts.set(id, matchCounts.get(id)! + value); - } else { - matchCounts.set(id, value); - } + for (const stemmedWord of stemmedWords) { + const entries = await query({ + prefix: ["fts", stemmedWord], + }); + for (const { key, value } of entries) { + const id = key[2]; + if (matchCounts.has(id)) { + matchCounts.set(id, matchCounts.get(id)! + value); + } else { + matchCounts.set(id, value); } } - - const results = Array.from(matchCounts.entries()).map( - ([id, score]) => ({ id, score }), - ); - - return results.sort((a, b) => b.score - a.score); } - // Delete a document from the index - public async deleteDocument(documentId: string): Promise { - const words = await this.index.query({ - prefix: ["fts_rev", documentId], - }); - const keysToDelete: KvKey[] = []; - for (const { key } of words) { - const word = key[2]; - keysToDelete.push(["fts", word, documentId]); - keysToDelete.push(key); - } - await this.index.batchDel(keysToDelete); - } + const results = Array.from(matchCounts.entries()).map( + ([id, score]) => ({ id, score }), + ); + + return results.sort((a, b) => b.score - a.score); } diff --git a/plugs/search/search.plug.yaml b/plugs/search/search.plug.yaml index 50fd59dd..20c1c1d1 100644 --- a/plugs/search/search.plug.yaml +++ b/plugs/search/search.plug.yaml @@ -5,14 +5,6 @@ functions: events: - page:index - clearIndex: - path: search.ts:clearIndex - - searchUnindex: - path: "./search.ts:pageUnindex" - env: client - events: - - page:deleted searchQueryProvider: path: ./search.ts:queryProvider events: diff --git a/plugs/search/search.ts b/plugs/search/search.ts index c4bff50b..7aeff23d 100644 --- a/plugs/search/search.ts +++ b/plugs/search/search.ts @@ -5,15 +5,13 @@ import { evalQueryExpression, liftAttributeFilter, } from "$sb/lib/query.ts"; -import { datastore, editor } from "$sb/syscalls.ts"; -import { SimpleSearchEngine } from "./engine.ts"; -import { FileMeta, KvKey } from "$sb/types.ts"; +import { editor } from "$sb/syscalls.ts"; +import { FileMeta } from "$sb/types.ts"; import { PromiseQueue } from "$sb/lib/async.ts"; +import { ftsIndexPage, ftsSearch } from "./engine.ts"; const searchPrefix = "🔍 "; -const engine = new SimpleSearchEngine(datastore); - // Search indexing is prone to concurrency issues, so we queue all write operations const promiseQueue = new PromiseQueue(); @@ -21,25 +19,8 @@ export function indexPage({ name, tree }: IndexTreeEvent) { const text = renderToText(tree); return promiseQueue.runInQueue(async () => { // console.log("Now FTS indexing", name); - await engine.deleteDocument(name); - await engine.indexDocument({ id: name, text }); - }); -} - -export async function clearIndex() { - const keysToDelete: KvKey[] = []; - for (const { key } of await datastore.query({ prefix: ["fts"] })) { - keysToDelete.push(key); - } - for (const { key } of await datastore.query({ prefix: ["fts_rev"] })) { - keysToDelete.push(key); - } - await datastore.batchDel(keysToDelete); -} - -export function pageUnindex(pageName: string) { - return promiseQueue.runInQueue(() => { - return engine.deleteDocument(pageName); + // await engine.deleteDocument(name); + await ftsIndexPage(name, text); }); } @@ -52,7 +33,7 @@ export async function queryProvider({ } const phrase = evalQueryExpression(phraseFilter, {}); // console.log("Phrase", phrase); - let results: any[] = await engine.search(phrase); + let results: any[] = await ftsSearch(phrase); // Patch the object to a format that users expect (translate id to name) for (const r of results) { @@ -78,7 +59,7 @@ export async function readFileSearch( searchPrefix.length, name.length - ".md".length, ); - const results = await engine.search(phrase); + const results = await ftsSearch(phrase); const text = `# Search results for "${phrase}"\n${ results .map((r) => `* [[${r.id}]] (score ${r.score})`)