Skip to content

Commit 1eae85e

Browse files
committed
feat: strip inline data URI images from LLM markdown output
Replace base64-encoded and other data URI images with text placeholders (using alt text when available) during HTML-to-markdown conversion. This prevents large base64 strings from wasting LLM context tokens.
1 parent f3ec6e2 commit 1eae85e

2 files changed

Lines changed: 33 additions & 0 deletions

File tree

R/build-llm.R

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ convert_md <- function(src_path, dst_path, url = NULL) {
7070
simplify_popovers_to_footnotes(main_html)
7171
simplify_lifecycle_badges(main_html)
7272
simplify_dls(main_html)
73+
simplify_inline_images(main_html)
7374
create_absolute_links(main_html, url)
7475

7576
path <- file_temp()
@@ -182,6 +183,22 @@ simplify_lifecycle_badges <- function(html) {
182183
invisible()
183184
}
184185

186+
# Replace inline data-URI images with short text placeholders.
#
# Every <img> whose `src` is a `data:` URI is swapped, in place, for a <span>
# whose text is "[Image: <alt>]" when the image has non-blank alt text and
# "[Image]" otherwise. This keeps large base64 payloads out of the markdown
# generated for LLMs.
#
# `html` is an xml2 document or node; it is modified by reference.
# Returns NULL invisibly.
simplify_inline_images <- function(html) {
  # Anchor the match to the start of the (whitespace-trimmed) `src` so that
  # ordinary URLs which merely contain "data:" somewhere, such as
  # "https://host/metadata:1/plot.png", are left untouched.
  img_nodes <- xml2::xml_find_all(
    html,
    ".//img[starts-with(normalize-space(@src), 'data:')]"
  )

  purrr::walk(img_nodes, function(img) {
    # xml_attr() yields NA when the attribute is absent; trimws() keeps NA as NA
    # and lets whitespace-only alt text fall through to the generic placeholder.
    alt_text <- trimws(xml2::xml_attr(img, "alt"))
    replacement <- if (!is.na(alt_text) && nzchar(alt_text)) {
      sprintf("[Image: %s]", alt_text)
    } else {
      "[Image]"
    }
    # The unnamed trailing argument becomes the text of the new <span>.
    xml2::xml_replace(img, "span", replacement)
  })

  invisible()
}
201+
185202
create_absolute_links <- function(main_html, url = NULL) {
186203
a <- xml2::xml_find_all(main_html, ".//a")
187204
xml2::xml_attr(a, "class") <- NULL

tests/testthat/test-build-llm.R

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,22 @@ test_that("replaces lifecycle badges with strong text", {
3131
)
3232
})
3333
34+
test_that("replaces inline data URI images with text placeholders", {
  # A data: URI image that carries alt text should surface that alt text
  # inside the placeholder.
  markup <- '<img src="data:image/png;base64,abc123" alt="A plot">'
  doc <- xml2::read_html(markup)

  simplify_inline_images(doc)

  placeholder <- xpath_text(doc, ".//span")
  expect_equal(placeholder, "[Image: A plot]")
})
41+
42+
test_that("replaces inline data URI images without alt text", {
  # With no alt attribute the generic placeholder is expected.
  markup <- '<img src="data:image/png;base64,abc123">'
  doc <- xml2::read_html(markup)

  simplify_inline_images(doc)

  placeholder <- xpath_text(doc, ".//span")
  expect_equal(placeholder, "[Image]")
})
49+
3450
test_that("converts internal urls to absolute with .md ending", {
3551
html <- xml2::read_html(
3652
r"(

0 commit comments

Comments
 (0)