2024-03-16 22:29:24 +08:00
|
|
|
import { stemmer } from "porter-stemmer";
|
2023-10-03 21:24:07 +08:00
|
|
|
import { batchSet, query } from "../index/plug_api.ts";
|
2023-05-24 02:53:53 +08:00
|
|
|
|
|
|
|
/** A single full-text search hit, as returned by `ftsSearch`. */
type ResultObject = {
  /** Identifier of the matching entry (taken from the index key). */
  id: string;
  /** Summed match count for this entry; higher sorts first. */
  score: number;
};
|
|
|
|
|
2023-10-03 21:24:07 +08:00
|
|
|
// Words that removeStopWords() discards before stemming. Note that
// removeStopWords() also drops every word of two characters or fewer, so
// "or", "a" and "an" are already excluded by that length check.
const stopWords = ["and", "or", "the", "a", "an"];
|
2023-05-24 02:53:53 +08:00
|
|
|
|
2023-10-03 21:24:07 +08:00
|
|
|
/**
 * Splits text into lowercase word tokens.
 *
 * Any run of characters that are not Unicode letters (`\p{L}`) acts as a
 * separator, so digits, punctuation and whitespace never appear in a token.
 *
 * @param text Raw text to tokenize.
 * @returns The lowercase tokens in order of appearance; an empty array when
 *   the text contains no letters.
 */
function tokenize(text: string): string[] {
  // split() yields "" when the text is empty or starts/ends with a
  // separator; drop those so callers only ever see real tokens.
  return text
    .toLowerCase()
    .split(/[^\p{L}]+/u)
    .filter((token) => token.length > 0);
}
|
2023-05-24 02:53:53 +08:00
|
|
|
|
2023-10-03 21:24:07 +08:00
|
|
|
/**
 * Filters a word list down to the words worth indexing.
 *
 * A word is kept only when it is longer than two characters, is not listed
 * in `stopWords`, and consists solely of Unicode letters.
 *
 * @param words Candidate words, expected to be lowercase already.
 * @returns A new array with the surviving words in their original order.
 */
function removeStopWords(words: string[]): string[] {
  const lettersOnly = /^\p{L}+$/u;
  const kept: string[] = [];
  for (const candidate of words) {
    if (candidate.length <= 2) continue;
    if (stopWords.includes(candidate)) continue;
    if (!lettersOnly.test(candidate)) continue;
    kept.push(candidate);
  }
  return kept;
}
|
2023-05-24 02:53:53 +08:00
|
|
|
|
2023-10-03 21:24:07 +08:00
|
|
|
/**
 * Reduces a word to its stem by delegating to the `stemmer` function
 * imported from "porter-stemmer".
 *
 * @param word Word to stem.
 * @returns The stem produced by `stemmer`.
 */
function stem(word: string): string {
  const stemmed = stemmer(word);
  return stemmed;
}
|
2023-05-24 02:53:53 +08:00
|
|
|
|
2023-10-03 21:24:07 +08:00
|
|
|
/**
 * Writes the full-text index entries for a single page.
 *
 * The page name and the page text are tokenized together, stop words and
 * short words are removed, and the remaining words are stemmed. Each
 * distinct stem is then handed to `batchSet` under the key `["fts", stem]`,
 * with the number of times that stem occurred as its value.
 *
 * NOTE(review): `ftsSearch` reads the page id from the third key element, so
 * `batchSet` presumably appends `pageName` to each key — confirm against
 * plug_api.
 *
 * @param pageName Name of the page; indexed itself and passed to `batchSet`.
 * @param text Content of the page.
 */
export async function ftsIndexPage(
  pageName: string,
  text: string,
): Promise<void> {
  const stems = removeStopWords([...tokenize(pageName), ...tokenize(text)])
    .map((word) => stem(word));

  // stem -> number of occurrences within this page
  const occurrences = new Map<string, number>();
  for (const current of stems) {
    occurrences.set(current, (occurrences.get(current) ?? 0) + 1);
  }

  await batchSet(
    pageName,
    Array.from(occurrences, ([stemmed, count]) => ({
      key: ["fts", stemmed],
      value: count,
    })),
  );
}
|
2023-05-24 02:53:53 +08:00
|
|
|
|
2023-10-03 21:24:07 +08:00
|
|
|
/**
 * Searches the full-text index for a phrase.
 *
 * The phrase is tokenized, filtered and stemmed with the same helpers that
 * `ftsIndexPage` uses. Every resulting stem is looked up with a prefix query
 * on `["fts", stem]`, and the stored counts are summed per entry id. A stem
 * that occurs more than once in the phrase is looked up (and counted) once
 * per occurrence.
 *
 * @param phrase Free-text search phrase.
 * @returns One result per matching id, sorted by descending score; an empty
 *   array when the phrase contains no indexable word or nothing matches.
 */
export async function ftsSearch(phrase: string): Promise<ResultObject[]> {
  const stemmedWords = removeStopWords(tokenize(phrase))
    .map((word) => stem(word));

  // The lookups do not depend on each other, so start them together instead
  // of awaiting each one in turn. Promise.all returns results in input
  // order, which keeps the accumulation order below unchanged.
  const entriesPerWord = await Promise.all(
    stemmedWords.map((stemmedWord) =>
      query({
        prefix: ["fts", stemmedWord],
      })
    ),
  );

  const matchCounts = new Map<string, number>(); // id -> summed count
  for (const entries of entriesPerWord) {
    for (const { key, value } of entries) {
      // NOTE(review): assumes index keys look like ["fts", stem, id] —
      // confirm against how batchSet builds them.
      const id = key[2];
      matchCounts.set(id, (matchCounts.get(id) ?? 0) + value);
    }
  }

  return Array.from(matchCounts, ([id, score]) => ({ id, score }))
    .sort((a, b) => b.score - a.score);
}
|