diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 31ef76865..ccc9bc59c 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -2,6 +2,10 @@ on: [push, pull_request] name: CI Linux +# Least privilege: this workflow only reads the repo and runs tests. +permissions: + contents: read + jobs: test-amd64: name: rust-libxml amd64 CI @@ -11,41 +15,29 @@ jobs: with_default_bindings: [false, true] steps: - name: install dependencies - uses: ryankurte/action-apt@v0.2.0 - with: - packages: "libxml2-dev" + run: | + sudo apt-get update + sudo apt-get install -y libxml2-dev - name: Set up LIBXML2 env var if compiling with the default bindings run: echo "LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so" >> "$GITHUB_ENV" if: ${{ matrix.with_default_bindings }} - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true + - uses: actions/checkout@v6 + - uses: dtolnay/rust-toolchain@stable - name: run tests - uses: actions-rs/cargo@v1 - with: - command: test + run: cargo test test-arm64: name: rust-libxml arm64 CI runs-on: ubuntu-24.04-arm steps: - name: install dependencies - uses: ryankurte/action-apt@v0.2.0 - with: - packages: "libxml2-dev" - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true + run: | + sudo apt-get update + sudo apt-get install -y libxml2-dev + - uses: actions/checkout@v6 + - uses: dtolnay/rust-toolchain@stable - name: run tests - uses: actions-rs/cargo@v1 - with: - command: test + run: cargo test test-newer-libxml2: strategy: @@ -55,10 +47,10 @@ jobs: runs-on: ubuntu-latest steps: - name: install dependencies - uses: ryankurte/action-apt@v0.2.0 - with: - packages: "libpython3-dev" - - uses: actions/checkout@v2 + run: | + sudo apt-get update + sudo apt-get install -y libpython3-dev + - uses: actions/checkout@v6 - name: Install libxml ${{ matrix.libxml_version }} by hand 
run: | wget https://download.gnome.org/sources/libxml2/$(echo ${{ matrix.libxml_version }} | sed -e 's/\.[0-9]*$//')/libxml2-${{ matrix.libxml_version }}.tar.xz @@ -67,14 +59,8 @@ jobs: ./configure make sudo make install - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true + - uses: dtolnay/rust-toolchain@stable - name: run tests - uses: actions-rs/cargo@v1 - with: - command: test + run: cargo test env: LD_LIBRARY_PATH: /usr/local/lib diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml index cfc127cdb..770579caf 100644 --- a/.github/workflows/gh-pages.yml +++ b/.github/workflows/gh-pages.yml @@ -4,6 +4,11 @@ on: - master name: Publish Docs + +# Deploy step pushes the rendered docs to the gh-pages branch. +permissions: + contents: write + jobs: publish_docs: if: github.repository == 'KWARC/rust-libxml' @@ -11,23 +16,16 @@ jobs: runs-on: ubuntu-latest steps: - name: install dependencies - uses: ryankurte/action-apt@v0.2.0 - with: - packages: "libxml2-dev" + run: | + sudo apt-get update + sudo apt-get install -y libxml2-dev - name: Set up LIBXML2 env var if compiling with the default bindings run: echo "LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so" >> "$GITHUB_ENV" - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true - name: Checkout sources - uses: actions/checkout@v4 + uses: actions/checkout@v6 + - uses: dtolnay/rust-toolchain@stable - name: Build Documentation - uses: actions-rs/cargo@v1 - with: - command: doc - args: --all --no-deps + run: cargo doc --all --no-deps - name: Deploy Documentation env: diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 55251c3ed..d885e2a10 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -2,16 +2,20 @@ on: [push, pull_request] name: CI Windows +# Least privilege: this workflow only reads the repo and runs tests. 
+permissions: + contents: read + jobs: test-default-windows: name: Windows vcpkg (default) runs-on: windows-latest - env: + env: VCPKGRS_DYNAMIC: 1 VCPKG_DEFAULT_TRIPLET: x64-windows VCPKG_ROOT: C:\vcpkg steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Setup vcpkg libxml2 Cache uses: actions/cache@v4 id: vcpkg-cache @@ -23,9 +27,7 @@ jobs: vcpkg install libxml2:x64-windows vcpkg integrate install - name: run tests - uses: actions-rs/cargo@v1 - with: - command: test + run: cargo test test-mingw64-windows: name: Windows (mingw64) @@ -34,7 +36,7 @@ jobs: run: shell: msys2 {0} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: msys2/setup-msys2@v2 with: path-type: minimal @@ -45,14 +47,18 @@ jobs: mingw64/mingw-w64-x86_64-pkg-config mingw64/mingw-w64-x86_64-libxml2 - name: Install stable windows-gnu Rust toolchain - uses: actions-rs/toolchain@v1 + uses: dtolnay/rust-toolchain@master with: toolchain: stable-x86_64-pc-windows-gnu - target: x86_64-pc-windows-gnu - override: true + targets: x86_64-pc-windows-gnu - name: Ensure mingw64 pkg-config is in path run: echo "C:\msys64\mingw64\bin" >> "$GITHUB_PATH" + # Run cargo in the default Windows shell, not the job's msys2 shell: + # rustup installs cargo to the Windows user profile, which + # `path-type: minimal` strips from the msys2 PATH (exit 127). The old + # actions-rs/cargo step ran in the runner's Windows context too, never + # msys2. mingw64/bin is on PATH via the step above, so pkg-config, gcc, + # and the libxml2 DLLs still resolve for the windows-gnu build. 
- name: run tests - uses: actions-rs/cargo@v1 - with: - command: test \ No newline at end of file + shell: pwsh + run: cargo test diff --git a/CHANGELOG.md b/CHANGELOG.md index 98eb16033..b092e9e57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,20 @@ # Change Log -## [0.3.12] (in development) +## [0.3.13] (in development) + +## [0.3.12] (2026-05-23) + +### Added + +* New `io` module: `io::register_input_callback(match_url, open)` is + a safe wrapper around `xmlRegisterInputCallbacks`. Accepts two + closures (`&str -> bool`, `&str -> Option<Vec<u8>>`) and installs + them as a custom URL-scheme handler. The intended use is bundling + XSLT stylesheets / RNG schemas via `include_bytes!` and serving + them through a synthetic scheme (e.g. `embed:///foo.xsl`), so + libxslt's `xsl:import` resolution can reach the embedded bytes + without touching the disk. Closures may run on any thread libxml2 + calls them from (`Send + Sync + 'static`). ## [0.3.11] (2026-05-18) diff --git a/Cargo.toml b/Cargo.toml index 736dd719e..a58eb7ab9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libxml" -version = "0.3.11" +version = "0.3.12" edition = "2024" authors = ["Andreas Franzén ", "Deyan Ginev ","Jan Frederik Schaefer "] description = "A Rust wrapper for libxml2 - the XML C parser and toolkit developed for the Gnome project" diff --git a/src/io.rs b/src/io.rs new file mode 100644 index 000000000..645316142 --- /dev/null +++ b/src/io.rs @@ -0,0 +1,297 @@ +//! Custom I/O callbacks for libxml2. +//! +//! libxml2 routes every URL it loads (XML documents, `xsl:import` / +//! `xsl:include` targets, RelaxNG `<include>`, DTD external subsets, +//! etc.) through a chain of registered "input callback" handlers. The +//! default chain handles `file://`, `http://`, `ftp://`, etc.; an +//! application can prepend its own handler for a custom URL scheme via +//! `xmlRegisterInputCallbacks`. +//! +//! This module wraps that C API in a safe, closure-friendly surface. +//!
The motivating use case is shipping a single-binary CLI that bundles +//! XSLT stylesheets / RNG schemas via `include_bytes!` and serves them +//! through a synthetic scheme (e.g. `embed:///foo.xsl`), so `xsl:import` +//! chains resolve without ever touching the disk. +//! +//! ```no_run +//! use libxml::io; +//! +//! static MAIN: &[u8] = b"\n"; +//! +//! io::register_input_callback( +//! |url| url.starts_with("embed:///"), +//! |url| match url.strip_prefix("embed:///") { +//! Some("main.xml") => Some(MAIN.to_vec()), +//! _ => None, +//! }, +//! ); +//! ``` +//! +//! ## Lifetime, threading, order +//! +//! Closures live for the process lifetime — libxml2 has no per-handler +//! unregister API. They may run on any thread (hence `Send + Sync`) and +//! **must not panic**: unwinding across the `extern "C"` trampoline +//! aborts on Rust 2024+. libxml2 walks callbacks newest-first; the +//! trampolines snapshot the registry and drop the lock before invoking +//! a user closure, so a closure that re-enters libxml2 won't +//! self-deadlock. If `open` returns `None`, libxml2 falls through to +//! the next handler — including its default file/HTTP loaders. + +use std::ffi::{CStr, c_char, c_int, c_void}; +use std::sync::{Arc, Mutex, OnceLock}; + +use crate::bindings::xmlRegisterInputCallbacks; + +type MatchFn = Box<dyn Fn(&str) -> bool + Send + Sync + 'static>; +type OpenFn = Box<dyn Fn(&str) -> Option<Vec<u8>> + Send + Sync + 'static>; + +struct Callback { + match_url: MatchFn, + open: OpenFn, +} + +fn callbacks() -> &'static Mutex<Vec<Arc<Callback>>> { + static CALLBACKS: OnceLock<Mutex<Vec<Arc<Callback>>>> = OnceLock::new(); + CALLBACKS.get_or_init(|| Mutex::new(Vec::new())) +} + +/// Clone the registry under the lock and return with the guard +/// dropped. Cloning is a cheap refcount bump per entry. The lock is +/// not held across user closures, so a closure that re-enters libxml2 +/// can't self-deadlock on the registry mutex. +fn snapshot() -> Vec<Arc<Callback>> { + callbacks().lock().unwrap().clone() +} + +/// Register a custom input callback with libxml2.
+/// +/// `match_url` is consulted for every URL libxml2 considers loading. +/// Return `true` to claim the URL; the same callback's `open` is then +/// invoked. Either function can defer: `match_url` returning `false` +/// skips the callback; `open` returning `None` falls through to the +/// next registered handler, including libxml2's defaults. +/// +/// Closures are `Send + Sync + 'static` because libxml2 may invoke +/// them from any thread. They are appended to a process-static +/// registry; there is no per-handler unregister. +/// +/// # Example +/// +/// Most commonly used to resolve `xsl:import` / `xsl:include` via +/// libxslt, or RelaxNG `<include>` via `xmlRelaxNGParse`. Note that +/// this crate's own `Parser::parse_file` uses Rust file I/O directly +/// and bypasses libxml2's URL machinery, so it does *not* trigger +/// these callbacks — wire them up via libxslt's `parse_bytes` or +/// libxml2's `xmlReadFile`. +/// +/// ```no_run +/// use libxml::io; +/// +/// static HELLO: &[u8] = b"\nworld"; +/// +/// io::register_input_callback( +/// |url| url.starts_with("embed:///"), +/// |url| (url == "embed:///hello.xml").then(|| HELLO.to_vec()), +/// ); +/// ``` +pub fn register_input_callback<M, O>(match_url: M, open: O) +where + M: Fn(&str) -> bool + Send + Sync + 'static, + O: Fn(&str) -> Option<Vec<u8>> + Send + Sync + 'static, +{ + callbacks().lock().unwrap().push(Arc::new(Callback { + match_url: Box::new(match_url), + open: Box::new(open), + })); + + // libxml2 records the trampoline pointers in a static table; + // registering twice would push duplicate entries that delegate to + // the same Rust registry. The OnceLock avoids that.
+ static REGISTERED: OnceLock<()> = OnceLock::new(); + REGISTERED.get_or_init(|| { + crate::init_parser(); + unsafe { + xmlRegisterInputCallbacks( + Some(trampoline_match), + Some(trampoline_open), + Some(trampoline_read), + Some(trampoline_close), + ); + } + }); +} + +/// Per-open state owned by libxml2 via `*mut c_void` until +/// `trampoline_close` reclaims and drops it. +struct OpenState { + bytes: Vec<u8>, + position: usize, +} + +unsafe extern "C" fn trampoline_match(filename: *const c_char) -> c_int { + if filename.is_null() { + return 0; + } + // SAFETY: libxml2 guarantees `filename` is a NUL-terminated C string + // for the call's lifetime. Non-UTF-8 URLs can't match anyway. + let url = match unsafe { CStr::from_ptr(filename) }.to_str() { + Ok(s) => s, + Err(_) => return 0, + }; + // Newest-first, mirroring `trampoline_open`'s walk. + for cb in snapshot().iter().rev() { + if (cb.match_url)(url) { + return 1; + } + } + 0 +} + +unsafe extern "C" fn trampoline_open(filename: *const c_char) -> *mut c_void { + if filename.is_null() { + return std::ptr::null_mut(); + } + // SAFETY: see `trampoline_match`. + let url = match unsafe { CStr::from_ptr(filename) }.to_str() { + Ok(s) => s, + Err(_) => return std::ptr::null_mut(), + }; + // Newest-first — the most recent registration wins. + for cb in snapshot().iter().rev() { + if !(cb.match_url)(url) { + continue; + } + if let Some(bytes) = (cb.open)(url) { + return Box::into_raw(Box::new(OpenState { bytes, position: 0 })) as *mut c_void; + } + } + std::ptr::null_mut() +} + +unsafe extern "C" fn trampoline_read( + context: *mut c_void, + buffer: *mut c_char, + len: c_int, +) -> c_int { + if context.is_null() || buffer.is_null() || len <= 0 { + return -1; + } + // SAFETY: `context` came from `Box::into_raw` in `trampoline_open` + // and is not yet reclaimed; libxml2 holds one reference per handle.
+ let state = unsafe { &mut *(context as *mut OpenState) }; + let remaining = state.bytes.len().saturating_sub(state.position); + let n = remaining.min(len as usize); + if n == 0 { + return 0; + } + // SAFETY: bounds checked above; src and dst are disjoint allocations. + unsafe { + std::ptr::copy_nonoverlapping( + state.bytes.as_ptr().add(state.position), + buffer as *mut u8, + n, + ); + } + state.position += n; + n as c_int +} + +unsafe extern "C" fn trampoline_close(context: *mut c_void) -> c_int { + if context.is_null() { + return -1; + } + // SAFETY: unique reclamation site for the box from `trampoline_open`. + let _state = unsafe { Box::from_raw(context as *mut OpenState) }; + 0 +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::bindings::{xmlFreeDoc, xmlReadFile}; + use std::ffi::CString; + use std::sync::atomic::{AtomicUsize, Ordering}; + + static SAMPLE_XML: &[u8] = br#" +"#; + + /// `Parser::parse_file` bypasses the input-callback machinery (it + /// reads via Rust file I/O), so the test must call `xmlReadFile` + /// directly — the same entry point libxslt uses for `xsl:import`. + fn read_file_via_libxml2(url: &str) -> bool { + let c = CString::new(url).unwrap(); + unsafe { + let doc = xmlReadFile(c.as_ptr(), std::ptr::null(), 0); + if doc.is_null() { + return false; + } + xmlFreeDoc(doc); + true + } + } + + /// Scenarios share one `#[test]` so they run sequentially. libxml2 + /// < 2.13 has a thread-safety bug in the input-callback path that + /// deadlocks concurrent `xmlReadFile` calls under cargo's default + /// parallel test runner. + #[test] + fn input_callback_scenarios() { + register_input_callback( + |url| url.starts_with("embed:///"), + |url| (url == "embed:///sample.xml").then(|| SAMPLE_XML.to_vec()), + ); + + // 1. Happy path. + assert!(read_file_via_libxml2("embed:///sample.xml")); + + // 2. `open` returning `None` declines this match; libxml2 falls + // through to the default file loader, which also fails. 
+ assert!(!read_file_via_libxml2("embed:///unknown.xml")); + + // 3. Unrelated URLs aren't claimed by our match — they reach the + // default file handler and fail there. + assert!(!read_file_via_libxml2("/nonexistent/definitely/missing.xml")); + + // 4. Re-entrancy: an `open` closure that calls into libxml2 must + // not self-deadlock on the registry mutex. + register_input_callback( + |url| url == "reentrant:///outer", + |_url| { + let _ = read_file_via_libxml2("embed:///sample.xml"); + Some(SAMPLE_XML.to_vec()) + }, + ); + assert!(read_file_via_libxml2("reentrant:///outer")); + + // 5. Newest-wins ordering: two callbacks claim the same URL; only + // the most recent registration runs and produces the bytes. + static FIRST_OPENED: AtomicUsize = AtomicUsize::new(0); + static SECOND_OPENED: AtomicUsize = AtomicUsize::new(0); + register_input_callback( + |url| url == "ordered:///x", + |_| { + FIRST_OPENED.fetch_add(1, Ordering::SeqCst); + Some(b"first".to_vec()) + }, + ); + register_input_callback( + |url| url == "ordered:///x", + |_| { + SECOND_OPENED.fetch_add(1, Ordering::SeqCst); + Some(SAMPLE_XML.to_vec()) + }, + ); + assert!(read_file_via_libxml2("ordered:///x")); + assert_eq!( + SECOND_OPENED.load(Ordering::SeqCst), + 1, + "newest registration should run", + ); + assert_eq!( + FIRST_OPENED.load(Ordering::SeqCst), + 0, + "older registration should not be consulted", + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index dd1ed88af..df59e9c07 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,11 @@ pub mod schemas; /// Read-only parallel primitives pub mod readonly; +/// Custom input callbacks for `xmlRegisterInputCallbacks` — bundle +/// XSLT stylesheets / RNG schemas inside the binary and serve them +/// through a user-defined URL scheme (e.g. `embed:///foo.xsl`). +pub mod io; + /// Ensure libxml2's global parser state is initialised. 
Safe to call from /// any number of threads — internally guarded by `std::sync::Once` so the /// underlying `xmlInitParser()` runs exactly once. Call this before