silverbullet/plugs/index/snippet_extractor.ts

/**
 * Extracts a short, human-readable snippet of `text` surrounding the
 * character offset `index`.
 *
 * The text is split into sentences with `Intl.Segmenter` (English rules).
 * If the sentence containing `index` is at most `maxSnippetLength` characters
 * long, it is returned trimmed. Otherwise a window of roughly
 * `maxSnippetLength` characters centred on `index` is cut from that sentence,
 * widened outwards to the nearest non-word character on each side, trimmed,
 * and wrapped in `"..."` on both ends.
 *
 * @param text The full text to extract the snippet from.
 * @param index Character offset into `text` that the snippet should cover.
 * @param maxSnippetLength Sentence length above which the sentence is cut
 *   down to a window around `index`. Defaults to 100.
 * @returns The snippet, or an empty string when no sentence contains `index`
 *   (for example when `index` is negative or past the end of `text`).
 */
export function extractSnippetAroundIndex(
  text: string,
  index: number,
  maxSnippetLength: number = 100,
): string {
  // Use Intl.Segmenter to segment the text into sentences
  const sentenceSegmenter = new Intl.Segmenter("en", {
    granularity: "sentence",
  });
  const rawSentences = [...sentenceSegmenter.segment(text)].map((segment) =>
    segment.segment
  );

  // Manual fix for markdown notation: when the segmenter emits "[[!" as a
  // segment of its own, glue it back onto the segment that follows it.
  const sentences: string[] = [];
  for (let i = 0; i < rawSentences.length; i++) {
    if (rawSentences[i] === "[[!" && rawSentences[i + 1]) {
      sentences.push(rawSentences[i] + rawSentences[i + 1]);
      i++;
    } else {
      sentences.push(rawSentences[i]);
    }
  }

  // Find the sentence that contains the index. `sentenceOffset` ends up as the
  // offset of that sentence within `text`.
  let sentenceOffset = 0;
  let targetSentence = "";
  for (const sentence of sentences) {
    if (index >= sentenceOffset && index < sentenceOffset + sentence.length) {
      targetSentence = sentence;
      break;
    }
    sentenceOffset += sentence.length;
  }

  // If the target sentence is within the maxSnippetLength, return it. This
  // also covers the "index not found" case, where targetSentence is "".
  if (targetSentence.length <= maxSnippetLength) {
    return targetSentence.trim();
  }

  const indexInSentence = index - sentenceOffset;

  // Half of the window, as a whole number. Without the floor an odd
  // maxSnippetLength produces fractional positions; indexing a string at a
  // fractional position yields `undefined`, which the regex test coerces to
  // the string "undefined" and treats as a word character, so the boundary
  // scans below would run all the way to the sentence edges.
  const halfWindow = Math.max(0, Math.floor(maxSnippetLength / 2));

  // Regex for checking if a character is a word character with unicode support
  const isWordCharacter = /[\p{L}\p{N}_]/u;

  // Find a reasonable word boundary to start the snippet: walk left until a
  // non-word character (or the start of the sentence) is reached.
  let snippetStartIndex = Math.max(indexInSentence - halfWindow, 0);
  while (
    snippetStartIndex > 0 &&
    isWordCharacter.test(targetSentence.charAt(snippetStartIndex))
  ) {
    snippetStartIndex--;
  }

  // Find a reasonable word boundary to end the snippet: walk right until a
  // non-word character (or the end of the sentence) is reached.
  let snippetEndIndex = Math.min(
    indexInSentence + halfWindow,
    targetSentence.length,
  );
  while (
    snippetEndIndex < targetSentence.length &&
    isWordCharacter.test(targetSentence.charAt(snippetEndIndex))
  ) {
    snippetEndIndex++;
  }

  // Extract and return the refined snippet
  return "..." +
    targetSentence.substring(snippetStartIndex, snippetEndIndex).trim() + "...";
}
Fixes #621 with improved snippets for page links 2024-03-02 21:48:02 +08:00			`export function extractSnippetAroundIndex(`
			`text: string,`
			`index: number,`
			`maxSnippetLength: number = 100,`
			`): string {`
			`// Use Intl.Segmenter to segment the text into sentences`
			`const sentenceSegmenter = new Intl.Segmenter("en", {`
			`granularity: "sentence",`
			`});`
Index []() style links (#827) 2024-05-28 02:33:41 +08:00			`let sentences = [...sentenceSegmenter.segment(text)].map((segment) =>`
Fixes #621 with improved snippets for page links 2024-03-02 21:48:02 +08:00			`segment.segment`
			`);`
Index []() style links (#827) 2024-05-28 02:33:41 +08:00			`// Manual fixes for markdown notation`
			`const tempSentences: string[] = [];`
			`for (let i = 0; i < sentences.length; i++) {`
			`if (sentences[i] === "[[!" && sentences[i + 1]) {`
			`tempSentences.push(sentences[i] + sentences[i + 1]);`
			`i++;`
			`} else {`
			`tempSentences.push(sentences[i]);`
			`}`
			`}`
			`sentences = tempSentences;`
Fixes #621 with improved snippets for page links 2024-03-02 21:48:02 +08:00
			`// Find the sentence that contains the index`
			`let currentLength = 0;`
			`let targetSentence = "";`
			`for (const sentence of sentences) {`
			`if (index >= currentLength && index < currentLength + sentence.length) {`
			`targetSentence = sentence;`
			`break;`
			`}`
			`currentLength += sentence.length;`
			`}`

			`// If the target sentence is within the maxSnippetLength, return it`
			`if (targetSentence.length <= maxSnippetLength) {`
			`return targetSentence.trim();`
			`}`

			`const indexInSentence = index - currentLength;`

			`// Regex for checking if a character is a word character with unicode support`
			`const isWordCharacter = /[\p{L}\p{N}_]/u;`

			`// Find a reasonable word boundary to start the snippet`
			`let snippetStartIndex = Math.max(indexInSentence - maxSnippetLength / 2, 0);`
			`while (`
			`snippetStartIndex > 0 &&`
			`isWordCharacter.test(targetSentence[snippetStartIndex])`
			`) {`
			`snippetStartIndex--;`
			`}`
			`snippetStartIndex = Math.max(snippetStartIndex, 0);`

			`// Find a reasonable word boundary to end the snippet`
			`let snippetEndIndex = Math.min(`
			`indexInSentence + maxSnippetLength / 2,`
			`targetSentence.length,`
			`);`
			`while (`
			`snippetEndIndex < targetSentence.length &&`
			`isWordCharacter.test(targetSentence[snippetEndIndex])`
			`) {`
			`snippetEndIndex++;`
			`}`
			`snippetEndIndex = Math.min(snippetEndIndex, targetSentence.length);`

			`// Extract and return the refined snippet`
			`return "..." +`
			`targetSentence.substring(snippetStartIndex, snippetEndIndex).trim() + "...";`
			`}`