silverbullet/plugs/index/snippet_extractor.ts

73 lines
2.2 KiB
TypeScript
Raw Permalink Normal View History

export function extractSnippetAroundIndex(
text: string,
index: number,
maxSnippetLength: number = 100,
): string {
// Use Intl.Segmenter to segment the text into sentences
const sentenceSegmenter = new Intl.Segmenter("en", {
granularity: "sentence",
});
2024-05-28 02:33:41 +08:00
let sentences = [...sentenceSegmenter.segment(text)].map((segment) =>
segment.segment
);
2024-05-28 02:33:41 +08:00
// Manual fixes for markdown notation
const tempSentences: string[] = [];
for (let i = 0; i < sentences.length; i++) {
if (sentences[i] === "[[!" && sentences[i + 1]) {
tempSentences.push(sentences[i] + sentences[i + 1]);
i++;
} else {
tempSentences.push(sentences[i]);
}
}
sentences = tempSentences;
// Find the sentence that contains the index
let currentLength = 0;
let targetSentence = "";
for (const sentence of sentences) {
if (index >= currentLength && index < currentLength + sentence.length) {
targetSentence = sentence;
break;
}
currentLength += sentence.length;
}
// If the target sentence is within the maxSnippetLength, return it
if (targetSentence.length <= maxSnippetLength) {
return targetSentence.trim();
}
const indexInSentence = index - currentLength;
// Regex for checking if a character is a word character with unicode support
const isWordCharacter = /[\p{L}\p{N}_]/u;
// Find a reasonable word boundary to start the snippet
let snippetStartIndex = Math.max(indexInSentence - maxSnippetLength / 2, 0);
while (
snippetStartIndex > 0 &&
isWordCharacter.test(targetSentence[snippetStartIndex])
) {
snippetStartIndex--;
}
snippetStartIndex = Math.max(snippetStartIndex, 0);
// Find a reasonable word boundary to end the snippet
let snippetEndIndex = Math.min(
indexInSentence + maxSnippetLength / 2,
targetSentence.length,
);
while (
snippetEndIndex < targetSentence.length &&
isWordCharacter.test(targetSentence[snippetEndIndex])
) {
snippetEndIndex++;
}
snippetEndIndex = Math.min(snippetEndIndex, targetSentence.length);
// Extract and return the refined snippet
return "..." +
targetSentence.substring(snippetStartIndex, snippetEndIndex).trim() + "...";
}