2023-05-24 02:53:53 +08:00
|
|
|
import { stemmer } from "https://esm.sh/porter-stemmer@0.9.1";
|
|
|
|
|
|
|
|
/**
 * A unit of text that can be added to the search index.
 */
export type Document = {
  /** Identifier stored in the index and returned in search results. */
  id: string;
  /** Raw text content to be tokenized and indexed. */
  text: string;
};
|
|
|
|
|
2023-08-31 03:12:33 +08:00
|
|
|
/**
 * Asynchronous key-value store that operates on batches of keys.
 *
 * Values are untyped at this level; callers decide what they store under
 * each key and are responsible for interpreting what they read back.
 */
export interface BatchKVStore {
  /**
   * Looks up several keys at once.
   *
   * @param keys keys to read
   * @returns one entry per requested key; `undefined` marks a key without a
   * value. NOTE(review): callers in this file rely on results being in the
   * same order as `keys` — confirm every implementation guarantees this.
   */
  get(keys: string[]): Promise<(any | undefined)[]>;

  /**
   * Writes several key/value pairs at once.
   *
   * @param entries map from key to the value to store under it
   */
  set(entries: Map<string, any>): Promise<void>;

  /**
   * Removes several keys at once.
   *
   * @param keys keys to remove
   */
  delete(keys: string[]): Promise<void>;
}
|
|
|
|
|
|
|
|
/**
 * A single search hit.
 */
type ResultObject = {
  /** Number of index entries that matched the query for this document; higher ranks first. */
  score: number;
  /** Identifier of the matching document. */
  id: string;
};
|
|
|
|
|
|
|
|
/**
 * Small full-text search engine backed by two batch key-value stores.
 *
 * - `index` maps a stemmed term to the list of document ids containing it.
 *   A document id is listed once per occurrence of the term, so documents
 *   that use a term more often score higher in `search`.
 * - `reverseIndex` maps a document id to the distinct stemmed terms that were
 *   indexed for it; `deleteDocument` uses it to find the entries to clean up.
 */
export class SimpleSearchEngine {
  /** Words ignored during both indexing and querying. */
  private stopWords = ["and", "or", "the", "a", "an"];

  /**
   * @param index store holding term -> document ids
   * @param reverseIndex store holding document id -> terms
   */
  constructor(
    public index: BatchKVStore,
    public reverseIndex: BatchKVStore,
  ) {
  }

  /**
   * Lower-cases the text and splits it on runs of non-letter characters
   * (Unicode-aware). The result may contain empty strings at the edges;
   * `removeStopWords` drops them through its length check.
   */
  private tokenize(text: string): string[] {
    return text.toLowerCase().split(/[^\p{L}]+/u);
  }

  /**
   * Keeps only words that are longer than two characters, are not stop words
   * and consist solely of Unicode letters.
   */
  private removeStopWords(words: string[]): string[] {
    return words.filter((word) =>
      word.length > 2 &&
      !this.stopWords.includes(word) && /^\p{L}+$/u.test(word)
    );
  }

  /** Reduces a word to its stem using the imported `stemmer`. */
  private stem(word: string): string {
    return stemmer(word);
  }

  /** Full analysis pipeline: tokenize, drop stop words, stem. */
  private toTerms(text: string): string[] {
    return this.removeStopWords(this.tokenize(text)).map((word) =>
      this.stem(word)
    );
  }

  /**
   * Adds a single document to the index. Both the document text and its id
   * are analyzed, so a document can also be found through words in its id.
   *
   * Every occurrence of a term appends the document id to that term's entry,
   * regardless of whether the term was already present in the index.
   *
   * NOTE(review): existing entries for `document.id` are not removed first;
   * callers re-indexing a document presumably call `deleteDocument` before —
   * confirm against the caller.
   *
   * @param document the document to index
   */
  public async indexDocument(document: Document): Promise<void> {
    const terms = [
      ...this.toTerms(document.text),
      ...this.toTerms(document.id),
    ];
    if (terms.length === 0) {
      // Nothing indexable: avoid empty round trips to the stores.
      return;
    }

    // Set iteration keeps first-occurrence order.
    const uniqueTerms = [...new Set(terms)];
    const storedPostings = await this.index.get(uniqueTerms);

    // Start every term from a copy of what is stored, so arrays owned by the
    // store are never mutated in place.
    const updateIndexMap = new Map<string, string[]>();
    uniqueTerms.forEach((term, i) => {
      updateIndexMap.set(term, [...(storedPostings[i] ?? [])]);
    });

    // One posting per occurrence, accumulated on the same array for both
    // new and already-indexed terms.
    for (const term of terms) {
      updateIndexMap.get(term)?.push(document.id);
    }

    const updateReverseIndexMap = new Map<string, string[]>([
      [document.id, uniqueTerms],
    ]);

    await this.index.set(updateIndexMap);
    await this.reverseIndex.set(updateReverseIndexMap);
  }

  /**
   * Searches for a phrase.
   *
   * The phrase goes through the same analysis as indexed text. A document's
   * score is the number of index entries that reference it across all query
   * terms.
   *
   * @param phrase free-text query
   * @returns matching documents ordered by descending score; empty when the
   * phrase contains no indexable words
   */
  public async search(phrase: string): Promise<ResultObject[]> {
    const terms = this.toTerms(phrase);
    if (terms.length === 0) {
      return [];
    }

    const postingsPerTerm: (string[] | undefined)[] = await this.index.get(
      terms,
    );

    const matchCounts = new Map<string, number>();
    for (const ids of postingsPerTerm) {
      if (!ids) {
        // Term is not in the index.
        continue;
      }
      for (const id of ids) {
        matchCounts.set(id, (matchCounts.get(id) ?? 0) + 1);
      }
    }

    return Array.from(matchCounts, ([id, score]) => ({ id, score }))
      .sort((a, b) => b.score - a.score);
  }

  /**
   * Removes a document from the index. Does nothing when the reverse index
   * has no entry for the id.
   *
   * @param documentId id of the document to remove
   */
  public async deleteDocument(documentId: string): Promise<void> {
    const lookup: (string[] | undefined)[] = await this.reverseIndex.get([
      documentId,
    ]);
    const terms = lookup?.[0];
    if (!terms) {
      return;
    }

    const storedPostings: (string[] | undefined)[] = await this.index.get(
      terms,
    );
    const deleteKeys: string[] = [];
    const updateMap = new Map<string, string[]>();

    terms.forEach((term, i) => {
      const ids = storedPostings[i];
      if (!ids) {
        return;
      }
      const remaining = ids.filter((id) => id !== documentId);
      if (remaining.length > 0) {
        updateMap.set(term, remaining);
      } else {
        // No other document uses this term: drop the key entirely.
        deleteKeys.push(term);
      }
    });

    if (deleteKeys.length > 0) {
      await this.index.delete(deleteKeys);
    }
    if (updateMap.size > 0) {
      await this.index.set(updateMap);
    }

    await this.reverseIndex.delete([documentId]);
  }
}
|