FTS simplifications

pull/528/head
Zef Hemel 2023-10-03 15:24:07 +02:00
parent 0b4f938c5d
commit d0bc1bc722
10 changed files with 119 additions and 217 deletions

View File

@ -215,6 +215,20 @@ export function applyQueryNoFilterKV(
allItems[i].value = newRec;
}
}
if (query.distinct) {
  // Drop items whose value was already seen, keeping the first occurrence.
  // Values are compared by their JSON serialization.
  const seenValues = new Set<string>();
  allItems = allItems.filter((candidate) => {
    const serialized = JSON.stringify(candidate.value);
    if (seenValues.has(serialized)) {
      return false;
    }
    seenValues.add(serialized);
    return true;
  });
}
if (query.limit) {
const limit = evalQueryExpression(query.limit, {}, functionMap);
if (allItems.length > limit) {

View File

@ -66,6 +66,7 @@ export type Query = {
select?: Select[];
limit?: QueryExpression;
render?: string;
distinct?: boolean;
};
export type KvQuery = Omit<Query, "querySource"> & {

View File

@ -1,5 +1,5 @@
import { datastore } from "$sb/syscalls.ts";
import { KV, KvKey, ObjectQuery, ObjectValue } from "$sb/types.ts";
import { KV, KvKey, KvQuery, ObjectQuery, ObjectValue } from "$sb/types.ts";
import { QueryProviderEvent } from "$sb/app_event.ts";
import { builtins } from "./builtins.ts";
import { AttributeObject, determineType } from "./attributes.ts";
@ -126,16 +126,25 @@ export async function queryObjects<T>(
return (await datastore.query({
...query,
prefix: [indexKey, tag],
distinct: true,
})).map(({ value }) => value);
}
export async function getObjectByRef<T>(
/**
 * Runs a key-value query against the datastore with `indexKey` prepended to
 * the query's prefix (an absent prefix is treated as empty).
 *
 * @param query the query to run; every field is forwarded as-is except
 *   `prefix`, which is replaced by the `indexKey`-scoped prefix
 * @returns the matching entries, each with the leading key segment removed
 */
export async function query(
  query: KvQuery,
): Promise<KV[]> {
  const rows = await datastore.query({
    ...query,
    prefix: [indexKey, ...(query.prefix || [])],
  });
  // Strip the first key segment (the one prepended above) before returning.
  return rows.map((row) => ({ key: row.key.slice(1), value: row.value }));
}
export function getObjectByRef<T>(
page: string,
tag: string,
ref: string,
): Promise<ObjectValue<T> | undefined> {
console.log("Fetching!!!!!", [indexKey, tag, cleanKey(ref, page), page]);
return (await datastore.get([indexKey, tag, cleanKey(ref, page), page]));
return datastore.get([indexKey, tag, cleanKey(ref, page), page]);
}
export async function objectSourceProvider({
@ -145,6 +154,7 @@ export async function objectSourceProvider({
const results = await datastore.query({
...query,
prefix: [indexKey, tag],
distinct: true,
});
return results.map((r) => r.value);
}

View File

@ -12,7 +12,6 @@ export async function reindexCommand() {
export async function reindexSpace() {
console.log("Clearing page index...");
// Executed this way to not have to embed the search plug code here
await system.invokeFunction("search.clearIndex");
await system.invokeFunction("index.clearIndex");
const pages = await space.listPages();

View File

@ -23,6 +23,9 @@ functions:
batchSet:
path: api.ts:batchSet
env: server
query:
path: api.ts:query
env: server
indexObjects:
path: api.ts:indexObjects
env: server

View File

@ -1,4 +1,4 @@
import { ObjectQuery, ObjectValue } from "$sb/types.ts";
import { KV, KvQuery, ObjectQuery, ObjectValue } from "$sb/types.ts";
import { invokeFunction } from "$sb/silverbullet-syscall/system.ts";
export function indexObjects<T>(
@ -8,6 +8,16 @@ export function indexObjects<T>(
return invokeFunction("index.indexObjects", page, objects);
}
/**
 * Forwards a batch of key-value entries to the `index.batchSet` function
 * through `invokeFunction`.
 *
 * @param page passed through as the first argument of `index.batchSet`
 * @param kvs passed through as the second argument of `index.batchSet`
 * @returns the promise produced by `invokeFunction`
 */
export function batchSet(page: string, kvs: KV[]): Promise<void> {
return invokeFunction("index.batchSet", page, kvs);
}
/**
 * Forwards a key-value query to the `index.query` function through
 * `invokeFunction`.
 *
 * @param query passed through unchanged as the only argument of `index.query`
 * @returns the promise produced by `invokeFunction`
 */
export function query(
query: KvQuery,
): Promise<KV[]> {
return invokeFunction("index.query", query);
}
export function queryObjects<T>(
tag: string,
query: ObjectQuery,

View File

@ -1,72 +0,0 @@
import { KV, KvKey } from "$sb/types.ts";
import { assertEquals } from "../../test_deps.ts";
import { BatchKVStore, SimpleSearchEngine } from "./engine.ts";
/**
 * `BatchKVStore` implementation backed by an in-memory `Map`.
 * Keys are stored as their JSON serialization and parsed back on query.
 */
class InMemoryBatchKVStore implements BatchKVStore {
  private store = new Map<string, any>();

  /**
   * Returns every stored entry whose key starts with `prefix`, in insertion
   * order. An empty prefix matches all entries.
   */
  query({ prefix }: { prefix: KvKey }): Promise<KV[]> {
    const matches: KV[] = [];
    this.store.forEach((value, serializedKey) => {
      const keyParts: string[] = JSON.parse(serializedKey);
      const startsWithPrefix = prefix.every(
        (segment, position) => segment === keyParts[position],
      );
      if (startsWithPrefix) {
        matches.push({ key: keyParts, value });
      }
    });
    return Promise.resolve(matches);
  }

  /** Stores (or overwrites) each given entry under its serialized key. */
  batchSet(kvs: KV[]): Promise<void> {
    kvs.forEach((entry) => {
      this.store.set(JSON.stringify(entry.key), entry.value);
    });
    return Promise.resolve();
  }

  /** Removes the entries stored under the given keys, if present. */
  batchDel(keys: KvKey[]): Promise<void> {
    keys.forEach((target) => {
      this.store.delete(JSON.stringify(target));
    });
    return Promise.resolve();
  }
}
// End-to-end check of SimpleSearchEngine on top of the in-memory store:
// index five documents, search, delete one document, and search again.
Deno.test("Test full text search", async () => {
const engine = new SimpleSearchEngine(new InMemoryBatchKVStore());
// Fixture documents; ids are strings and double as the result ids asserted below.
await engine.indexDocument({ id: "1", text: "The quick brown fox" });
await engine.indexDocument({ id: "2", text: "jumps over the lazy dogs" });
await engine.indexDocument({
id: "3",
text: "Hello world, jumping jump jumps",
});
await engine.indexDocument({ id: "4", text: "TypeScript is awesome" });
await engine.indexDocument({ id: "5", text: "The brown dogs jumps zęf" });
console.log(engine.index);
// "Brown fox" is expected to match documents 1 and 5, ordered by score:
// document 1 first with score 2, then document 5 with score 1.
const results = await engine.search("Brown fox");
console.log(results);
assertEquals(results.length, 2);
assertEquals(results[0].id, "1");
assertEquals(results[0].score, 2);
assertEquals(results[1].id, "5");
assertEquals(results[1].score, 1);
// "jump" is expected to match three of the indexed documents.
const results2 = await engine.search("jump");
console.log(results2);
assertEquals(results2.length, 3);
// After deleting document 3, the same search is expected to return one match fewer.
await engine.deleteDocument("3");
const results3 = await engine.search("jump");
console.log(results3);
assertEquals(results3.length, 2);
// A word containing a non-ASCII letter is expected to be searchable as well.
const results4 = await engine.search("zęf");
console.log(results4);
assertEquals(results4.length, 1);
});

View File

@ -1,124 +1,88 @@
import { stemmer } from "https://esm.sh/porter-stemmer@0.9.1";
import { KV, KvKey } from "$sb/types.ts";
export type Document = {
id: string;
text: string;
};
export interface BatchKVStore {
batchSet(kvs: KV[]): Promise<void>;
batchDel(keys: KvKey[]): Promise<void>;
query(options: { prefix: KvKey }): Promise<KV[]>;
}
import { batchSet, query } from "../index/plug_api.ts";
type ResultObject = {
score: number;
id: string;
};
export class SimpleSearchEngine {
private stopWords = ["and", "or", "the", "a", "an"];
const stopWords = ["and", "or", "the", "a", "an"];
constructor(
public index: BatchKVStore,
// public reverseIndex: BatchKVStore,
) {
/**
 * Splits text into lowercase word tokens.
 *
 * The text is lowercased and split on every run of characters that are not
 * Unicode letters (`\p{L}`), so digits, punctuation and whitespace all act as
 * separators.
 *
 * @param text the raw text to tokenize
 * @returns the letter-only tokens in order of appearance; never contains
 *   empty strings, and is empty when the text has no letters
 */
function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    .split(/[^\p{L}]+/u)
    // `split` yields "" when the text starts or ends with a separator (and
    // for empty input); those are not words, so drop them.
    .filter((token) => token.length > 0);
}
/**
 * Filters a list of words down to the ones worth indexing.
 *
 * A word is kept only when it is longer than two characters, is not listed
 * in `stopWords`, and consists solely of Unicode letters.
 *
 * @param words candidate words
 * @returns the words that pass all three checks, in their original order
 */
function removeStopWords(words: string[]): string[] {
  const lettersOnly = /^\p{L}+$/u;
  const kept: string[] = [];
  for (const candidate of words) {
    if (candidate.length <= 2) {
      continue;
    }
    if (stopWords.includes(candidate)) {
      continue;
    }
    if (!lettersOnly.test(candidate)) {
      continue;
    }
    kept.push(candidate);
  }
  return kept;
}
/**
 * Reduces a word to its stem by delegating to `stemmer` from the
 * `porter-stemmer` package imported at the top of this file.
 *
 * @param word the word to stem
 * @returns whatever `stemmer` returns for `word`
 */
function stem(word: string): string {
return stemmer(word);
}
// Index an array of documents
export async function ftsIndexPage(
pageName: string,
text: string,
): Promise<void> {
const updateIndexMap = new Map<string, number>(); // word!id -> count
const pageNameTokens = tokenize(pageName);
const pageContentTokens = tokenize(text);
const words = [...pageNameTokens, ...pageContentTokens];
const filteredWords = removeStopWords(words);
const stemmedWords = filteredWords.map(stem);
// Get the current IDs for these stemmed words
// const uniqueStemmedWords = [...new Set(stemmedWords)];
for (const stemmedWord of stemmedWords) {
const currentFreq = updateIndexMap.get(stemmedWord) || 0;
updateIndexMap.set(stemmedWord, currentFreq + 1);
}
// Tokenize text into words
private tokenize(text: string): string[] {
return text.toLowerCase().split(/[^\p{L}]+/u);
}
// console.log("updateIndexMap", updateIndexMap);
// Remove stop words from array of words
private removeStopWords(words: string[]): string[] {
return words.filter((word) =>
word.length > 2 &&
!this.stopWords.includes(word) && /^\p{L}+$/u.test(word)
);
}
await batchSet(
pageName,
[...updateIndexMap.entries()].map((
[key, value],
) => ({ key: ["fts", key], value })),
);
}
// Basic stemming function
private stem(word: string): string {
return stemmer(word);
}
// Search for a phrase and return document ids sorted by match count
export async function ftsSearch(phrase: string): Promise<ResultObject[]> {
const words = tokenize(phrase);
const filteredWords = removeStopWords(words);
const stemmedWords = filteredWords.map((word) => stem(word));
// Index an array of documents
public async indexDocument(document: Document): Promise<void> {
const updateIndexMap = new Map<string, number>(); // word!id -> count
const updateReverseIndexMap = new Map<string, boolean>(); // id!word -> true
// const wordIdsArray: string[][] = await this.index.get(stemmedWords);
const matchCounts: Map<string, number> = new Map(); // pageName -> count
const pageContent = this.tokenize(document.text);
const pageName = this.tokenize(document.id);
const words = [...pageContent, ...pageName];
const filteredWords = this.removeStopWords(words);
const stemmedWords = filteredWords.map((word) => this.stem(word));
// Get the current IDs for these stemmed words
// const uniqueStemmedWords = [...new Set(stemmedWords)];
for (const stemmedWord of stemmedWords) {
const key = `${stemmedWord}!${document.id}`;
const revKey = `${document.id}!${stemmedWord}`;
const currentFreq = updateIndexMap.get(key) || 0;
updateIndexMap.set(key, currentFreq + 1);
updateReverseIndexMap.set(revKey, true);
}
// console.log("updateIndexMap", updateIndexMap);
await this.index.batchSet(
[...updateIndexMap.entries()].map((
[key, value],
) => ({ key: ["fts", ...key.split("!")], value: value })),
);
await this.index.batchSet(
[...updateReverseIndexMap.entries()].map((
[key, value],
) => ({ key: ["fts_rev", ...key.split("!")], value: value })),
);
}
// Search for a phrase and return document ids sorted by match count
public async search(phrase: string): Promise<ResultObject[]> {
const words = this.tokenize(phrase);
const filteredWords = this.removeStopWords(words);
const stemmedWords = filteredWords.map((word) => this.stem(word));
// const wordIdsArray: string[][] = await this.index.get(stemmedWords);
const matchCounts: Map<string, number> = new Map(); // pageName -> count
for (const stemmedWord of stemmedWords) {
const entries = await this.index.query({ prefix: ["fts", stemmedWord] });
for (const { key, value } of entries) {
const id = key[2];
if (matchCounts.has(id)) {
matchCounts.set(id, matchCounts.get(id)! + value);
} else {
matchCounts.set(id, value);
}
for (const stemmedWord of stemmedWords) {
const entries = await query({
prefix: ["fts", stemmedWord],
});
for (const { key, value } of entries) {
const id = key[2];
if (matchCounts.has(id)) {
matchCounts.set(id, matchCounts.get(id)! + value);
} else {
matchCounts.set(id, value);
}
}
const results = Array.from(matchCounts.entries()).map(
([id, score]) => ({ id, score }),
);
return results.sort((a, b) => b.score - a.score);
}
// Delete a document from the index
public async deleteDocument(documentId: string): Promise<void> {
const words = await this.index.query({
prefix: ["fts_rev", documentId],
});
const keysToDelete: KvKey[] = [];
for (const { key } of words) {
const word = key[2];
keysToDelete.push(["fts", word, documentId]);
keysToDelete.push(key);
}
await this.index.batchDel(keysToDelete);
}
const results = Array.from(matchCounts.entries()).map(
([id, score]) => ({ id, score }),
);
return results.sort((a, b) => b.score - a.score);
}

View File

@ -5,14 +5,6 @@ functions:
events:
- page:index
clearIndex:
path: search.ts:clearIndex
searchUnindex:
path: "./search.ts:pageUnindex"
env: client
events:
- page:deleted
searchQueryProvider:
path: ./search.ts:queryProvider
events:

View File

@ -5,15 +5,13 @@ import {
evalQueryExpression,
liftAttributeFilter,
} from "$sb/lib/query.ts";
import { datastore, editor } from "$sb/syscalls.ts";
import { SimpleSearchEngine } from "./engine.ts";
import { FileMeta, KvKey } from "$sb/types.ts";
import { editor } from "$sb/syscalls.ts";
import { FileMeta } from "$sb/types.ts";
import { PromiseQueue } from "$sb/lib/async.ts";
import { ftsIndexPage, ftsSearch } from "./engine.ts";
const searchPrefix = "🔍 ";
const engine = new SimpleSearchEngine(datastore);
// Search indexing is prone to concurrency issues, so we queue all write operations
const promiseQueue = new PromiseQueue();
@ -21,25 +19,8 @@ export function indexPage({ name, tree }: IndexTreeEvent) {
const text = renderToText(tree);
return promiseQueue.runInQueue(async () => {
// console.log("Now FTS indexing", name);
await engine.deleteDocument(name);
await engine.indexDocument({ id: name, text });
});
}
export async function clearIndex() {
const keysToDelete: KvKey[] = [];
for (const { key } of await datastore.query({ prefix: ["fts"] })) {
keysToDelete.push(key);
}
for (const { key } of await datastore.query({ prefix: ["fts_rev"] })) {
keysToDelete.push(key);
}
await datastore.batchDel(keysToDelete);
}
export function pageUnindex(pageName: string) {
return promiseQueue.runInQueue(() => {
return engine.deleteDocument(pageName);
// await engine.deleteDocument(name);
await ftsIndexPage(name, text);
});
}
@ -52,7 +33,7 @@ export async function queryProvider({
}
const phrase = evalQueryExpression(phraseFilter, {});
// console.log("Phrase", phrase);
let results: any[] = await engine.search(phrase);
let results: any[] = await ftsSearch(phrase);
// Patch the object to a format that users expect (translate id to name)
for (const r of results) {
@ -78,7 +59,7 @@ export async function readFileSearch(
searchPrefix.length,
name.length - ".md".length,
);
const results = await engine.search(phrase);
const results = await ftsSearch(phrase);
const text = `# Search results for "${phrase}"\n${
results
.map((r) => `* [[${r.id}]] (score ${r.score})`)