docs/app/components/command-k/create-search-index.ts at d2c3788622e64c05d4ddda061d1e464f8d7ed575 · code-forge-io/docs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import type { Page } from "content-collections"
import slug from "slug"
import { getPageSlug } from "~/utils/get-page-slug"

/**
 * Reduces one markdown/MDX paragraph to plain text for search indexing.
 *
 * Removes list markers, inline code / emphasis delimiters, link and image
 * syntax (keeping the visible text) and inline `{ ... }` MDX attributes, then
 * collapses redundant whitespace.
 *
 * @param raw - A single paragraph of markdown/MDX source.
 * @returns The paragraph with markup removed and whitespace normalised.
 */
function cleanParagraph(raw: string): string {
	return (
		raw
			// list bullets / ordered list markers at line start; handled before
			// emphasis so that "* item" bullets are not paired up as "*italic*"
			.replace(/^\s*[-*+]\s+/gm, "")
			.replace(/^\s*\d+\.\s+/gm, "")
			// strip inline code, bold, italics
			.replace(/`([^`]+)`/g, "$1")
			.replace(/\*\*([^*]+)\*\*/g, "$1")
			.replace(/\*([^*]+)\*/g, "$1")
			// underscore emphasis (_x_ / __x__) only counts when the delimiters sit
			// at word boundaries, so snake_case identifiers are left intact
			.replace(/(^|[^\w])(_{1,2})(?=\S)(.+?)\2(?!\w)/g, "$1$3")
			// strip markdown links [text](url) and images ![alt](url), keeping the text
			.replace(/!?\[([^\]]+)\]\([^)]+\)/g, "$1")
			// strip mdx attributes { ... } inline
			.replace(/\{[^}]*\}/g, "")
			// collapse whitespace
			.replace(/\n{2,}/g, "\n")
			.replace(/[ \t]+/g, " ")
			.trim()
	)
}

/**
 * Removes every triple-backtick fenced block (fences included) from the text.
 * A fence without a closing counterpart is left untouched.
 */
function stripCodeFences(src: string) {
	const fencedBlock = /```[\s\S]*?```/g
	const withoutFences = src.replace(fencedBlock, "")
	return withoutFences
}

/**
 * Breaks text into paragraphs on blank lines, cleans each one with
 * `cleanParagraph`, and drops paragraphs that end up empty.
 */
function splitIntoParagraphs(src: string) {
	const paragraphs: string[] = []

	for (const chunk of src.split(/\n\s*\n/g)) {
		const cleaned = cleanParagraph(chunk)
		if (cleaned.length > 0) {
			paragraphs.push(cleaned)
		}
	}

	return paragraphs
}

/**
 * Converts a heading regex match into a plain record.
 *
 * Expects capture group 1 to hold the `#` run and group 2 the heading text.
 * `index` is the match offset in the searched string (0 when unavailable) and
 * `length` is the length of the whole matched text.
 */
const extractHeadingData = (match: RegExpMatchArray) => {
	const matchedText = match[0]
	const hashRun = match[1]
	const headingText = match[2]
	const offset = match.index ?? 0

	return {
		level: hashRun.length,
		text: headingText,
		index: offset,
		length: matchedText.length,
	}
}

/** A heading-delimited chunk of a page: cleaned heading text, its URL anchor and its paragraphs. */
type HeadingSection = {
	heading: string
	anchor: string
	paragraphs: string[]
}

/**
 * Splits raw MDX into sections delimited by markdown headings (`#` to `######`).
 *
 * Fenced code blocks are removed first, so `#` lines inside them are not treated
 * as headings. Text before the first heading (or the whole document when there
 * are no headings) becomes a section whose heading and anchor are both
 * `"_intro"`; it is omitted when it yields no paragraphs. Every heading produces
 * a section, even when no paragraphs follow it.
 *
 * Heading anchors are the slugified heading text (`"_section"` when the slug is
 * empty); repeated anchors get a numeric suffix starting at `-2`.
 *
 * @param rawMdx - Raw MDX source of a page.
 * @returns The sections in document order.
 */
function extractHeadingSections(rawMdx: string): HeadingSection[] {
	const src = stripCodeFences(rawMdx)
	const headingRegex = /^(#{1,6})\s+(.+?)\s*$/gm
	const matches = Array.from(src.matchAll(headingRegex), extractHeadingData)

	const firstMatch = matches[0]
	if (firstMatch === undefined) {
		// no headings at all: the whole document is the intro
		const paragraphs = splitIntoParagraphs(src)
		return paragraphs.length ? [{ heading: "_intro", anchor: "_intro", paragraphs }] : []
	}

	const usedAnchors = new Set<string>()

	// returns baseAnchor, or baseAnchor-2, baseAnchor-3, ... if it was already handed out
	const createUniqueAnchor = (baseAnchor: string) => {
		let unique = baseAnchor
		let n = 2
		while (usedAnchors.has(unique)) {
			unique = `${baseAnchor}-${n++}`
		}
		usedAnchors.add(unique)
		return unique
	}

	// strips inline code, bold/italic asterisks, links and { ... } attributes from heading text
	const cleanHeadingText = (text: string) =>
		text
			.replace(/`([^`]+)`/g, "$1")
			.replace(/\*\*([^*]+)\*\*/g, "$1")
			.replace(/\*([^*]+)\*/g, "$1")
			.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
			.replace(/\{[^}]*\}/g, "")
			.trim()

	// explicitly typed: an untyped `[]` captured by the callback below would be `any[]`
	const sections: HeadingSection[] = []

	// add an intro section if content exists before the first heading
	const introBlock = src.slice(0, firstMatch.index).trim()
	if (introBlock) {
		const introParas = splitIntoParagraphs(introBlock)
		if (introParas.length) {
			sections.push({ heading: "_intro", anchor: "_intro", paragraphs: introParas })
		}
	}

	matches.forEach((match, i) => {
		// a section's body runs from the end of its heading line to the next heading (or end of text)
		const nextMatch = matches[i + 1]
		const block = src.slice(match.index + match.length, nextMatch?.index).trim()

		const rawHeading = cleanHeadingText(match.text)
		const baseAnchor = slug(rawHeading) || "_section"
		const anchor = createUniqueAnchor(baseAnchor)
		const paragraphs = splitIntoParagraphs(block)

		sections.push({
			heading: rawHeading,
			anchor,
			paragraphs,
		})
	})

	return sections
}

/**
 * Builds the flat list of search entries for the given pages.
 *
 * Pages whose slug is `"_index"` are skipped. Every remaining page contributes
 * one entry per section returned by `extractHeadingSections`; the entry id is
 * the page URL (always with a leading slash) followed by `#` and the section
 * anchor. For the `"_intro"` section the page title is used as the subtitle.
 * The subtitle is also placed first in the entry's `paragraphs`.
 */
export function createSearchIndex(pages: Page[]) {
	const isIndexable = (page: Page) => page.slug !== "_index"

	const toEntries = (page: Page) => {
		const rawSlug = getPageSlug(page)
		const pageUrl = rawSlug.startsWith("/") ? rawSlug : `/${rawSlug}`

		return extractHeadingSections(page.rawMdx).map(({ heading, anchor, paragraphs }) => {
			const subtitle = heading === "_intro" ? page.title : heading

			return {
				id: `${pageUrl}#${anchor}`,
				title: page.title,
				subtitle,
				paragraphs: [subtitle, ...paragraphs],
			}
		})
	}

	return pages.filter(isIndexable).flatMap(toEntries)
}